This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#Use the data sets folder fo setwd
setwd("C:/Users/Arash/Documents/Data sets")
library("plyr")
## Warning: package 'plyr' was built under R version 3.2.5
library("dplyr")
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("sqldf")
## Warning: package 'sqldf' was built under R version 3.2.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.2.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.2.5
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.2.5
library("nFactors")
## Warning: package 'nFactors' was built under R version 3.2.5
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.2.5
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: psych
## Warning: package 'psych' was built under R version 3.2.5
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
library("MASS")
library("psych")
library("ggmap")
## Warning: package 'ggmap' was built under R version 3.2.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library("ggplot2")
library("corrplot")
## Warning: package 'corrplot' was built under R version 3.2.5
library("lubridate")
## Warning: package 'lubridate' was built under R version 3.2.5
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following object is masked from 'package:base':
##
## date
library("reshape")
## Warning: package 'reshape' was built under R version 3.2.5
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:plyr':
##
## rename, round_any
library("sqldf")
library("maps")
## Warning: package 'maps' was built under R version 3.2.5
##
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
##
## ozone
library("zipcode")
## Warning: package 'zipcode' was built under R version 3.2.5
library("caret")
## Warning: package 'caret' was built under R version 3.2.5
library("rpart")
library("rpart.plot")
## Warning: package 'rpart.plot' was built under R version 3.2.5
library("cwhmisc")
## Warning: package 'cwhmisc' was built under R version 3.2.5
## Loading required package: grid
##
## Attaching package: 'cwhmisc'
## The following object is masked from 'package:ggplot2':
##
## %+%
## The following object is masked from 'package:psych':
##
## %+%
library("rattle")
## Warning: package 'rattle' was built under R version 3.2.5
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library("e1071")
## Warning: package 'e1071' was built under R version 3.2.5
library("broom")
library("randomForest")
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
library("nnet")
library("xtable")
## Warning: package 'xtable' was built under R version 3.2.5
library("visreg")
## Warning: package 'visreg' was built under R version 3.2.5
################# DATA PREPARAATION ##########################
#combining and exploring data for group project
#exploration of data for group project
setwd("C:/Users/Arash/Documents/Data sets")
crime <- read.csv("Crime_2014.csv")
facilities <- read.csv("Facilities_by_Zipcode.csv")
home_sales <- read.csv("MC_Home_Sales_by_Zip_Code_2014.csv")
dropout <- read.csv("MCPS_Dropout_Attendance_by_Zipcode.csv")
irs <- read.csv("MC_IRS.csv")
Most_Data <- read.csv("Most_Data.csv")
data <- read.csv("Long_and_Foster_Columns_Removed.csv")
##### NOTE: Tthe followin part is the very first codes generated to clean and scrutizing data. it can be skipped######
#Make a table of number of incidents by zipcode using dplyr library
#it is sorting (TRUE) by most crime to least crime.
zip_code_tbl <- tbl_df(crime)
Incidents_by_zipcode <-zip_code_tbl %>% group_by(Zip.Code) %>% tally(sort = TRUE)
write.csv(Incidents_by_zipcode, "Crime_by_Zipcode_2014.csv")
#Rename the columns so they can be combined with Most_Data.csv
Incidents_by_zipcode$Zip <- Incidents_by_zipcode$Zip.Code
Incidents_by_zipcode$Number_of_Crimes_2014 <- Incidents_by_zipcode$n
#take out the original names of the columns and keep only the new ones
keeps <- c("Zip", "Number_of_Crimes_2014")
Incidents_by_zipcode <-Incidents_by_zipcode[keeps]
#counts the number of specific zipcodes, for example 20852
#length(which(crime$Zip.Code == 20852))
#same thing with the public facilities dataset
#facilities_tbl <- tbl_df(facilities)
#facilities_by_zipcode <-facilities_tbl %>% group_by(Zip) %>% tally(sort = TRUE)
#write.csv(facilities_by_zipcode, "Facilities_by_Zipcode.csv")
#try to combine facilities and crime
crime$Zip<-crime$Zip.Code
data_1 <- merge(crime, facilities, by = "Zip", all.x = TRUE, all.y = TRUE)
#add dropout
data_2 <- merge(data_1, dropout, by = "Zip", all.x = TRUE, all.y =TRUE)
#add irs
data_3 <- merge(data_2, irs, by = "Zip", all.x = TRUE, all.y = TRUE)
#combining the crime incidents 2014 with the other Most_Data_file
Most_Data_2014 <- merge(Incidents_by_zipcode, Most_Data, by = "Zip", all.x = TRUE, all.y = TRUE)
#write the file to csv
#write.csv(Most_Data_2014, "Most_Data_2014.csv")
###########
#cleaning Long_and_Foster housing sales dataset
#original file name Group_Project_L_F_Housing_Cleaning
#Natasha
data_1 <- read.csv("Long_and_Foster.csv")
Most_Data <- read.csv("Most_Data_2014.csv")
str(data_1)
## 'data.frame': 10894 obs. of 31 variables:
## $ ML. : Factor w/ 9731 levels "MC7402458","MC7720901",..: 4168 2917 5 1368 3826 38 2427 3841 12 21 ...
## $ City : Factor w/ 38 levels "ADELPHI","ASHTON",..: 10 4 31 4 4 4 4 10 31 10 ...
## $ State : Factor w/ 1 level "MD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Zip.4 : int 4458 3026 NA 3065 4581 2258 3065 6660 1956 4208 ...
## $ Zip.Code : int 20815 20817 20854 20817 20817 20816 20817 20815 20854 20815 ...
## $ List.Price : num 8750000 7500000 4995000 4495000 4495000 ...
## $ Original.List.Price : num 8750000 5995000 5995000 4795000 4495000 ...
## $ Close.Price : num 8650000 7350000 4400000 4200000 4350000 4100000 4000000 3900000 3300000 3310000 ...
## $ Advertised.Subdivision: Factor w/ 1594 levels "0","10101 GROSVENOR PARK COD",..: 1018 107 180 107 1088 514 107 708 1088 214 ...
## $ Legal.Subdivision : Factor w/ 1318 levels "","10101 GROSVENOR PARK COD",..: 855 92 154 92 912 441 92 606 912 194 ...
## $ Status : Factor w/ 1 level "SOLD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Close.Date : Factor w/ 312 levels "1/1/2014","1/10/2014",..: 155 205 150 205 218 118 213 211 172 168 ...
## $ DOMM : int 0 19 524 106 10 243 26 31 442 294 ...
## $ DOMP : int 0 343 524 106 10 243 26 31 442 294 ...
## $ Baths.All : int 4 9 13 11 10 8 7 8 10 8 ...
## $ Baths.Half : int 0 2 3 1 3 1 1 2 2 2 ...
## $ Baths.Full : int 4 7 10 10 7 7 6 6 8 6 ...
## $ Bedrooms : int 4 6 9 8 5 5 5 6 5 6 ...
## $ Condo.Coop.Fee : num 5010 NA NA NA NA NA NA NA NA NA ...
## $ Cooling : Factor w/ 290 levels "Air Purification System, Ceiling Fan(s), Central Air Conditioning, ENERGY STAR Cooling System, Heat Pump(s), Programmable Therm"| __truncated__,..: 218 68 199 281 199 199 138 138 199 199 ...
## $ Dining.Kitchen : Factor w/ 2339 levels "2nd Kitchen",..: 2181 328 11 190 190 1293 190 1947 190 190 ...
## $ Farm : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Fireplaces : int 1 5 5 5 6 4 4 5 3 3 ...
## $ Heating : Factor w/ 399 levels "90% Forced Air",..: 170 110 110 251 110 241 251 170 251 49 ...
## $ HOA : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ HOA.Fee : num NA NA NA NA NA NA NA NA 747 NA ...
## $ Lot.Sqft : int NA 124058 90169 42974 87120 35658 30755 18481 217800 11583 ...
## $ Total.Square.Footage : int 3400 0 18500 12000 11783 5116 8800 0 8500 0 ...
## $ Townhouse.Type : Factor w/ 9 levels "","Detached",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Type : Factor w/ 16 levels "Attach/Row Hse",..: 7 3 3 3 3 3 3 3 3 3 ...
## $ Parking : Factor w/ 1149 levels "Additional Storage Area, Gen Comm Elem, Unassigned",..: 575 819 575 575 488 399 769 355 575 874 ...
data_1 <- data_1[!is.na(data_1$Zip.Code),]
#this dataset is clean, there were no na fields in zipcode
glimpse(data_1)
## Observations: 10,894
## Variables: 31
## $ ML. <fctr> MC8320294, MC8291029, MC7919334, MC824...
## $ City <fctr> CHEVY CHASE, BETHESDA, POTOMAC, BETHES...
## $ State <fctr> MD, MD, MD, MD, MD, MD, MD, MD, MD, MD...
## $ Zip.4 <int> 4458, 3026, NA, 3065, 4581, 2258, 3065,...
## $ Zip.Code <int> 20815, 20817, 20854, 20817, 20817, 2081...
## $ List.Price <dbl> 8750000, 7500000, 4995000, 4495000, 449...
## $ Original.List.Price <dbl> 8750000, 5995000, 5995000, 4795000, 449...
## $ Close.Price <dbl> 8650000, 7350000, 4400000, 4200000, 435...
## $ Advertised.Subdivision <fctr> PARC SOMERSET CODM, BRADLEY HILLS GROV...
## $ Legal.Subdivision <fctr> PARC SOMERSET CODM, BRADLEY HILLS GROV...
## $ Status <fctr> SOLD, SOLD, SOLD, SOLD, SOLD, SOLD, SO...
## $ Close.Date <fctr> 4/15/2014, 6/12/2014, 4/1/2014, 6/12/2...
## $ DOMM <int> 0, 19, 524, 106, 10, 243, 26, 31, 442, ...
## $ DOMP <int> 0, 343, 524, 106, 10, 243, 26, 31, 442,...
## $ Baths.All <int> 4, 9, 13, 11, 10, 8, 7, 8, 10, 8, 9, 6,...
## $ Baths.Half <int> 0, 2, 3, 1, 3, 1, 1, 2, 2, 2, 2, 2, 0, ...
## $ Baths.Full <int> 4, 7, 10, 10, 7, 7, 6, 6, 8, 6, 7, 4, 7...
## $ Bedrooms <int> 4, 6, 9, 8, 5, 5, 5, 6, 5, 6, 6, 6, 4, ...
## $ Condo.Coop.Fee <dbl> 5010, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Cooling <fctr> Heat Pump(s), Ceiling Fan(s), Central ...
## $ Dining.Kitchen <fctr> Sep Dining Rm, Breakfast Room, Gourmet...
## $ Farm <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ Fireplaces <int> 1, 5, 5, 5, 6, 4, 4, 5, 3, 3, 6, 5, 4, ...
## $ Heating <fctr> Forced Air, Central, Forced Air, Zoned...
## $ HOA <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ HOA.Fee <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 747.00,...
## $ Lot.Sqft <int> NA, 124058, 90169, 42974, 87120, 35658,...
## $ Total.Square.Footage <int> 3400, 0, 18500, 12000, 11783, 5116, 880...
## $ Townhouse.Type <fctr> , , , , , , , , , , , , , , , , , , , ...
## $ Type <fctr> Hi-Rise 9+ Floors, Detached, Detached,...
## $ Parking <fctr> Garage, Garage, Paved Driveway, Garage...
#this will list all of the levels in a column
df1 = data_1
factor1 <- sqldf("select distinct Type as 'Type' from df1")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
factor1
## Type
## 1 Hi-Rise 9+ Floors
## 2 Detached
## 3 Townhouse
## 4 House of Worship
## 5 Attach/Row Hse
## 6 Patio Home
## 7 Semi-Detached
## 8 Garden 1-4 Floors
## 9 Mid-Rise 5-8 Floors
## 10 Duplex
## 11 Other
## 12 Dwelling w/Rental
## 13 Back-to-Back
## 14 Multi-Family
## 15 Penthouse
## 16 Quad
#lists the number of times each level appears in a column (in this case Type is the column)
set.seed(1)
data_1 %>%
group_by(Type) %>%
summarise(no_rows = length(Type))
## # A tibble: 16 × 2
## Type no_rows
## <fctr> <int>
## 1 Attach/Row Hse 160
## 2 Back-to-Back 29
## 3 Detached 6301
## 4 Duplex 18
## 5 Dwelling w/Rental 2
## 6 Garden 1-4 Floors 963
## 7 Hi-Rise 9+ Floors 694
## 8 House of Worship 2
## 9 Mid-Rise 5-8 Floors 114
## 10 Multi-Family 9
## 11 Other 23
## 12 Patio Home 49
## 13 Penthouse 2
## 14 Quad 2
## 15 Semi-Detached 29
## 16 Townhouse 2497
#same thing to find how many sales in each zipcode in 2014
set.seed(1)
number_of_sales_by_zip<- data_1 %>%
group_by(Zip.Code) %>%
summarise(no_rows = length(Zip.Code))
#Rename the columns so they can be combined with Most_Data.csv
number_of_sales_by_zip$Zip <- number_of_sales_by_zip$Zip.Code
number_of_sales_by_zip$Number_of_Sales_2014 <- number_of_sales_by_zip$no_rows
#take out the original names of the columns and keep only the new ones
keeps <- c("Zip", "Number_of_Sales_2014")
Sales_by_zipcode <-number_of_sales_by_zip[keeps]
#combining the number of sales 2014 with the other Most_Data_file
Most_Data_2014 <- merge(number_of_sales_by_zip, Most_Data, by = "Zip", all.x = TRUE, all.y = TRUE)
row.has.na <- apply(Most_Data_2014, 1, function(x){any(is.na(x))})
# aggregating the different crime rates by state
crime<-read.csv("crime-original.csv")
b=matrix(0,55, 89-23+2)
for (i in 24:90)
b[,i-22] <- (aggregate(crime[,i]~Zip.Code,data = crime,FUN = "sum"))$`crime[, i]`
# 68 crime types for 55 states
#some modification on Crime file in excel( the new crime file is (crime-final))
crime_f<-read.csv("crime-final.csv")
#some modification on most_data_2014 in excel( the new file is most_data_2014)
#some modification on Crime file in excel( the new crime file is (crime-final))
crime_f<-read.csv("crime-final.csv")
#most_data
merge1<-read.csv("merge.csv")
housing<-data
colnames(facilities)[1]="Zip.Code"
data_1 <- merge(crime_f, facilities, by = "Zip.Code", all.x = TRUE, all.y = TRUE)
data_2 <- merge(data_1,merge1, by = "Zip.Code", all.x = TRUE, all.y = TRUE)
data_3 <- merge(data_2,housing, by = "Zip.Code", all.x = TRUE, all.y = TRUE)
data_4<-data_3[-c(1,2100,8099),]
#Normalizing crimes
a=data_4$IRS_Estimated_Population_2014
data_4[,2:24]<-apply(data_4[,2:24],2,FUN = function(x) x/a)
#The final data to work on
main=data_4
#removing NAs, Classify based on prices
nn= is.na(main$community_facilities_count)
main=main[!nn,]
main$price_dif= log(main$Close.Price) -log(main$Original.List.Price)
main$class=cut(main$Close.Price,c(0,300000,650000,8650000),c(1:3))
main$Date.Quarter=cut(main$Date.Quarter,c(0,1,2,3,4),c(1:4))
#nn= is.na(main$Lot.Sqft)
#main=main[!nn,]
############################ Crime Analysis (by Nibret)###########################################
summary(crime_f)
## Zip.Code ROB.FIREARM...STREET AGG.ASSLT.FIREARM.CITIZEN
## Min. :20812 Min. : 0.00 Min. : 0.00
## 1st Qu.:20842 1st Qu.: 0.00 1st Qu.: 0.50
## Median :20866 Median : 7.00 Median : 4.00
## Mean :20865 Mean :15.02 Mean : 9.14
## 3rd Qu.:20891 3rd Qu.:22.50 3rd Qu.:11.50
## Max. :20912 Max. :82.00 Max. :38.00
## BURG.FORCE.RES.NIGHT LARCENY.PICK.POCKET AUTO.THEFT...PASSENGER.VEHICLE
## Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 12.00 1st Qu.: 32.0 1st Qu.: 2.00
## Median : 46.00 Median : 175.0 Median : 7.00
## Mean : 55.91 Mean : 290.6 Mean :18.88
## 3rd Qu.: 95.50 3rd Qu.: 482.5 3rd Qu.:35.50
## Max. :222.00 Max. :1132.0 Max. :79.00
## ASSAULT...BATTERY...CITIZEN VANDALISM.MOTOR.VEHICLE
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 7.50 1st Qu.: 10.0
## Median : 39.00 Median : 49.0
## Mean : 64.63 Mean : 65.3
## 3rd Qu.: 95.00 3rd Qu.:117.0
## Max. :222.00 Max. :219.0
## WEAPON.POSSESSION.HANDGUN SEX.OFFENSE...SEX..ASSAULT drug
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 6.50
## Median : 5.000 Median : 3.000 Median : 52.00
## Mean : 7.791 Mean : 4.791 Mean : 96.91
## 3rd Qu.:11.500 3rd Qu.: 8.500 3rd Qu.:152.00
## Max. :33.000 Max. :21.000 Max. :396.00
## FAMILY.OFFENSE...ABUSE.CHILD JUVENILE.RUNAWAY
## Min. : 0.000 Min. : 0.0
## 1st Qu.: 1.000 1st Qu.: 0.0
## Median : 5.000 Median : 5.0
## Mean : 6.233 Mean :12.3
## 3rd Qu.: 8.500 3rd Qu.:19.5
## Max. :24.000 Max. :56.0
## LIQUOR...UNLAWFUL.POSS.UNDER.21 DISORDERLY.CONDUCT
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.00 1st Qu.: 0.00
## Median : 8.00 Median : 6.00
## Mean : 26.12 Mean : 22.26
## 3rd Qu.: 26.50 3rd Qu.: 22.50
## Max. :207.00 Max. :189.00
## SUICIDE...POISON.OVERDOSE LITTERING.TRASH.DUMPING TRESPASSING
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 6.000 Median : 0.000 Median : 2.00
## Mean : 6.674 Mean : 1.116 Mean : 10.79
## 3rd Qu.:10.500 3rd Qu.: 1.000 3rd Qu.: 14.00
## Max. :27.000 Max. :13.000 Max. :129.00
## HARASSMENT.STALKING DRIVING.UNDER.THE.INFLUENCE FIRE.OTHER
## Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 5.00 1st Qu.: 1.00
## Median : 2.000 Median : 36.00 Median : 7.00
## Mean : 2.465 Mean : 74.91 Mean :11.12
## 3rd Qu.: 4.000 3rd Qu.:151.00 3rd Qu.:16.50
## Max. :10.000 Max. :246.00 Max. :43.00
## POL.INFORMATION LOST.PROPERTY RECOVERED.PROPERTY.MONT..CO.
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 10.50 1st Qu.: 3.50 1st Qu.: 1.00
## Median : 44.00 Median : 17.00 Median : 7.00
## Mean : 60.09 Mean : 36.56 Mean :13.28
## 3rd Qu.: 85.50 3rd Qu.: 57.50 3rd Qu.:21.50
## Max. :262.00 Max. :208.00 Max. :91.00
##Focusing on Mean
crime.mean <- lapply(crime_f, mean, na.rm = TRUE)
print(crime.mean)
## $Zip.Code
## [1] 20864.53
##
## $ROB.FIREARM...STREET
## [1] 15.02326
##
## $AGG.ASSLT.FIREARM.CITIZEN
## [1] 9.139535
##
## $BURG.FORCE.RES.NIGHT
## [1] 55.90698
##
## $LARCENY.PICK.POCKET
## [1] 290.6279
##
## $AUTO.THEFT...PASSENGER.VEHICLE
## [1] 18.88372
##
## $ASSAULT...BATTERY...CITIZEN
## [1] 64.62791
##
## $VANDALISM.MOTOR.VEHICLE
## [1] 65.30233
##
## $WEAPON.POSSESSION.HANDGUN
## [1] 7.790698
##
## $SEX.OFFENSE...SEX..ASSAULT
## [1] 4.790698
##
## $drug
## [1] 96.90698
##
## $FAMILY.OFFENSE...ABUSE.CHILD
## [1] 6.232558
##
## $JUVENILE.RUNAWAY
## [1] 12.30233
##
## $LIQUOR...UNLAWFUL.POSS.UNDER.21
## [1] 26.11628
##
## $DISORDERLY.CONDUCT
## [1] 22.25581
##
## $SUICIDE...POISON.OVERDOSE
## [1] 6.674419
##
## $LITTERING.TRASH.DUMPING
## [1] 1.116279
##
## $TRESPASSING
## [1] 10.7907
##
## $HARASSMENT.STALKING
## [1] 2.465116
##
## $DRIVING.UNDER.THE.INFLUENCE
## [1] 74.90698
##
## $FIRE.OTHER
## [1] 11.11628
##
## $POL.INFORMATION
## [1] 60.09302
##
## $LOST.PROPERTY
## [1] 36.55814
##
## $RECOVERED.PROPERTY.MONT..CO.
## [1] 13.27907
##Crime Correlation
crime_corelation <- cor(crime_f)
corrplot(crime_corelation, method="circle")
##Crime by Month
data = read.csv("date-crime.csv")
df <- data.frame(data)
dat.m <- melt(df,id.vars = "Month")
ggplot(dat.m, aes(x = Month, y = value, fill=variable)) +
geom_bar(stat='identity') + guides(fill=FALSE) + scale_x_discrete(breaks = 1:12, labels=c("Jan","foo","bar","baz","phi","fum", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"))
##########################Maps for crime and housing prices( By Ashley)##############################
data(zipcode)
sale_count <- main
sale_count$Zip.Code<- clean.zipcodes(sale_count$Zip.Code)
#combine current dataset with zipcode dataset
sale_count<- merge(sale_count, zipcode, by.x='Zip.Code', by.y='zip')
#group housing sale count by zip code
density<- ddply(sale_count, .(Zip.Code), "nrow")
names(density)[2] <- "count"
#combine current dataset with 'count' field
Sale <- merge(sale_count, density)
#remove duplicates (only show unique zip codes)
Sale<-Sale[!duplicated(Sale$Zip.Code),]
#map of housing sales in Montgomery County
moco <- get_map("montgomery county")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=montgomery+county&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=montgomery%20county&sensor=false
moco_map <- ggmap(moco)
moco_housing <- moco_map + stat_density2d(aes(x = longitude, y = latitude, fill = ..level..,
alpha = ..level..),
bins = 4, data = Sale,
geom = "polygon") + xlim(-77.5,-76.8) + ylim(38.9,39.3) + labs(title="House Sales By Zip Code", x="Longitude", y="Latitude")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
moco_housing
## Warning: Removed 1 rows containing missing values (geom_rect).
#CRIME COUNTS (using just the crime dataset, since each row is a crime incident)
crime_count <- read.csv("crime_2014.csv", header=TRUE, sep=",")
crime_count$Zip.Code<- clean.zipcodes(crime_count$Zip.Code)
#combine current dataset with zipcode dataset
crime_count<- merge(crime_count, zipcode, by.x='Zip.Code', by.y='zip')
#group crime count by zip code
density<- ddply(crime_count, .(Zip.Code), "nrow")
names(density)[2] <- "count"
#combine current dataset with 'count' field
Crime <- merge(crime_count, density)
#remove duplicates (only show unique zip codes)
Crime<-Crime[!duplicated(Crime$Zip.Code),]
#map of crime counts in Montgomery County
moco_crime <- moco_map + stat_density2d(aes(x = longitude, y = latitude, fill = ..level..,
alpha = ..level..),
bins = 4, data = Crime,
geom = "polygon") + xlim(-77.5,-76.8) + ylim(38.9,39.3) + labs(title="Crime Counts By Zip Code", x="Longitude", y="Latitude")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
moco_crime
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 1 rows containing missing values (geom_rect).
#MEDIAN HOUSING SALES
dataset <- main
main$Zip.Code<- clean.zipcodes(main$Zip.Code)
#combine current dataset with zipcode dataset
median_sale<- merge(main, zipcode, by.x='Zip.Code', by.y='zip')
#group median price by zip code
density<- ddply(median_sale, .(Zip.Code), "nrow")
names(density)[2] <- "count"
#combine current dataset with 'count' field
Median <- merge(median_sale, density)
#remove duplicates (only show unique zip codes)
Median<-Median[!duplicated(Median$Zip.Code),]
#map of median price in Montgomery County
moco_median <- moco_map + stat_bin2d(
aes(x = longitude, y = latitude, colour = Median_Sales,
fill = Median_Sales),
size = 0.25, bins = 20, alpha = 0.5,
data = Median) + labs(title="Median Housing Prices By Zip Code", x="Longitude", y="Latitude")
moco_median
################ CLASSIFICATION MODELS and two additiona regression models(By Arash)#######################
#training and testing data
index=nrow(main)
index2=sample(index, round(index/5))
train=main[-index2,]
test=main[index2,]
#SVM
base1 <- sum(test$class == 2) / nrow(test)
results <- data.frame(model=c("MFC"), score=c(base1))
#performane function
performance1 = function (M,df,name){
pr=predict(M,test)
ac=confusionMatrix(pr,test$class)$overal[1]
df <- rbind(df, data.frame(model=c(name), score=ac) )
return(df)
}
#+ number of bedrooms
train1=train[,c(45,54)]
M1= svm(class~.,data = train1)
#+number of bathrooms
train1=train[,c(45,44,54)]
M2= svm(class~.,data = train1)
#+type of house
train1=train[,c(45,44,50,54)]
M3= svm(class~.,data = train1)
#+garage
train1=train[,c(45,44,50,52,54)]
M4= svm(class~.,data = train1)
#+total crime
train1=train[,c(27,45,44,50,52,54)]
M5= svm(class~.,data = train1)
#+facilies
train1=train[,c(25,27,45,44,50,52,54)]
M6= svm(class~.,data = train1)
#+crimes in details
train1=train[,c(2:24,25,27,45,44,50,52,54)]
M7= svm(class~.,data = train1)
results<-performance1(M1,results,"+number of bedrooms")
results<-performance1(M2,results,"+number of bathrooms")
results<-performance1(M3,results,"+type of house")
results<-performance1(M4,results,"+garage")
results<-performance1(M5,results,"+total crime")
results<-performance1(M6,results,"+facilies")
results<-performance1(M7,results,"+crime in details")
#The result of feature engineering for model accuracy
results
## model score
## 1 MFC 0.4979253
## Accuracy +number of bedrooms 0.6274781
## Accuracy1 +number of bathrooms 0.6597510
## Accuracy2 +type of house 0.6860304
## Accuracy3 +garage 0.7123098
## Accuracy4 +total crime 0.7183034
## Accuracy5 +facilies 0.7349009
## Accuracy6 +crime in details 0.8363301
# AS we can see the detailed crime give an additional 10 % accuracy
#Error Analysis: contingency table
test$pr=predict(M7,test)
confusionMatrix(test$pr,test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 424 81 0
## 2 83 928 120
## 3 0 71 462
##
## Overall Statistics
##
## Accuracy : 0.8363
## 95% CI : (0.8201, 0.8517)
## No Information Rate : 0.4979
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.736
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.8363 0.8593 0.7938
## Specificity 0.9513 0.8136 0.9553
## Pos Pred Value 0.8396 0.8205 0.8668
## Neg Pred Value 0.9501 0.8536 0.9267
## Prevalence 0.2337 0.4979 0.2683
## Detection Rate 0.1955 0.4278 0.2130
## Detection Prevalence 0.2328 0.5214 0.2457
## Balanced Accuracy 0.8938 0.8364 0.8745
#Decision tree
#using the final model obtained by feature engineering by a decision model and gain the accuracy
train1=train[,c(2:24,25,27,45,44,50,52,54)]
M8= rpart(class~.,data = train1,method = "class",parms=list(split="information"),
control=rpart.control(usesurrogate=0, maxsurrogate=0))
fancyRpartPlot(M8)
test$pr=predict(M8,test,type = "class")
c(Accuracy=mean(test$pr==test$class))
## Accuracy
## 0.8003688
#Error Analysis: contingency table
confusionMatrix(test$pr,test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 405 97 0
## 2 102 894 145
## 3 0 89 437
##
## Overall Statistics
##
## Accuracy : 0.8004
## 95% CI : (0.7829, 0.817)
## No Information Rate : 0.4979
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6774
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.7988 0.8278 0.7509
## Specificity 0.9416 0.7732 0.9439
## Pos Pred Value 0.8068 0.7835 0.8308
## Neg Pred Value 0.9388 0.8191 0.9117
## Prevalence 0.2337 0.4979 0.2683
## Detection Rate 0.1867 0.4122 0.2015
## Detection Prevalence 0.2314 0.5260 0.2425
## Balanced Accuracy 0.8702 0.8005 0.8474
#Random forest
#Using Random forrest as an improvised version of Decision Trees
M9 <- randomForest(class ~ . , data = train1)
test$pr=predict(M9,test,type = "class")
c(Accuracy=mean(test$pr==test$class))
## Accuracy
## 0.8497003
confusionMatrix(test$pr,test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 435 71 0
## 2 71 938 112
## 3 1 71 470
##
## Overall Statistics
##
## Accuracy : 0.8497
## 95% CI : (0.834, 0.8645)
## No Information Rate : 0.4979
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.758
## Mcnemar's Test P-Value : 0.01705
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.8580 0.8685 0.8076
## Specificity 0.9573 0.8320 0.9546
## Pos Pred Value 0.8597 0.8368 0.8672
## Neg Pred Value 0.9567 0.8645 0.9312
## Prevalence 0.2337 0.4979 0.2683
## Detection Rate 0.2006 0.4325 0.2167
## Detection Prevalence 0.2333 0.5168 0.2499
## Balanced Accuracy 0.9076 0.8502 0.8811
#Multinomial logisitc regression
train1=train[,c(2:24,25,27,45,44,50,52,54)]
test1=test[,c(2:24,25,27,45,44,50,52,54)]
M10=multinom(class~.,data=train1)
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7123.458885
## iter 20 value 5901.535115
## iter 30 value 5190.619980
## iter 40 value 4733.458334
## iter 50 value 4468.519069
## iter 60 value 3897.529378
## iter 70 value 3834.239842
## iter 80 value 3805.719500
## iter 90 value 3758.113407
## iter 100 value 3715.500857
## final value 3715.500857
## stopped after 100 iterations
summary(M10)
## Call:
## multinom(formula = class ~ ., data = train1)
##
## Coefficients:
## (Intercept) ROB.FIREARM...STREET AGG.ASSLT.FIREARM.CITIZEN
## 2 -7.135703 -213.75731 -99.34253
## 3 -16.302249 17.51108 -154.08095
## BURG.FORCE.RES.NIGHT LARCENY.PICK.POCKET AUTO.THEFT...PASSENGER.VEHICLE
## 2 -144.7899 212.2837 231.20507
## 3 159.0209 479.3184 -98.78393
## ASSAULT...BATTERY...CITIZEN VANDALISM.MOTOR.VEHICLE
## 2 -187.477 -270.1950
## 3 -1353.464 205.1457
## WEAPON.POSSESSION.HANDGUN SEX.OFFENSE...SEX..ASSAULT drug
## 2 26.01334 168.1376 -573.8341
## 3 -278.20594 -212.9966 -1386.4735
## FAMILY.OFFENSE...ABUSE.CHILD JUVENILE.RUNAWAY
## 2 2.423389 126.1159
## 3 -246.872710 -648.3282
## LIQUOR...UNLAWFUL.POSS.UNDER.21 DISORDERLY.CONDUCT
## 2 107.6619 -431.0789
## 3 -465.9298 293.9566
## SUICIDE...POISON.OVERDOSE LITTERING.TRASH.DUMPING TRESPASSING
## 2 -69.27394 12.49951 87.00731
## 3 60.80521 -30.81818 -28.40027
## HARASSMENT.STALKING DRIVING.UNDER.THE.INFLUENCE FIRE.OTHER
## 2 -29.86632 125.0875 -14.404768
## 3 99.37772 -311.4888 7.409725
## POL.INFORMATION LOST.PROPERTY RECOVERED.PROPERTY.MONT..CO.
## 2 376.7549 80.24806 248.39233
## 3 651.7004 425.63089 75.55131
## community_facilities_count Number_of_Crimes_2014 Bedrooms Baths.All
## 2 -0.015218174 0.0002911417 0.5840812 1.435588
## 3 0.005355814 0.0011295990 0.8755583 2.574569
## Type.yBack-to-Back Type.yDetached Type.yDuplex Type.yDwelling w/Rental
## 2 -29.44206 3.866120 -0.16771 19.13971
## 3 -10.58724 5.548716 -16.26789 -13.35261
## Type.yGarden 1-4 Floors Type.yHi-Rise 9+ Floors Type.yHouse of Worship
## 2 0.3754771 -0.3850387 23.167452
## 3 -0.2276385 -0.3488163 -4.445758
## Type.yMid-Rise 5-8 Floors Type.yMulti-Family Type.yOther
## 2 1.3832287 -1.5808008 0.8849762
## 3 0.4042231 -0.0937052 0.5258798
## Type.yPatio Home Type.yPenthouse Type.yQuad Type.ySemi-Detached
## 2 0.7332469 35.3289469 -15.812324 0.4324783
## 3 2.3358770 -0.8937585 -1.038738 1.2834926
## Type.yTownhouse Has.GarageTRUE
## 2 0.3647558 1.984681
## 3 0.1220644 3.459350
##
## Std. Errors:
## (Intercept) ROB.FIREARM...STREET AGG.ASSLT.FIREARM.CITIZEN
## 2 0.03711233 1.713773e-05 1.442724e-05
## 3 0.01027058 8.740511e-06 6.355311e-06
## BURG.FORCE.RES.NIGHT LARCENY.PICK.POCKET AUTO.THEFT...PASSENGER.VEHICLE
## 2 1.056690e-04 0.0008601332 2.100343e-05
## 3 3.762445e-05 0.0003568732 1.149117e-05
## ASSAULT...BATTERY...CITIZEN VANDALISM.MOTOR.VEHICLE
## 2 4.130097e-05 9.567034e-05
## 3 1.220721e-05 3.952020e-05
## WEAPON.POSSESSION.HANDGUN SEX.OFFENSE...SEX..ASSAULT drug
## 2 8.192552e-06 2.753739e-06 6.989767e-05
## 3 2.906017e-06 1.453405e-06 2.964995e-05
## FAMILY.OFFENSE...ABUSE.CHILD JUVENILE.RUNAWAY
## 2 6.407365e-06 5.554892e-06
## 3 3.224107e-06 1.217695e-06
## LIQUOR...UNLAWFUL.POSS.UNDER.21 DISORDERLY.CONDUCT
## 2 4.095796e-05 4.834967e-05
## 3 1.803187e-05 2.709065e-05
## SUICIDE...POISON.OVERDOSE LITTERING.TRASH.DUMPING TRESPASSING
## 2 1.468976e-05 2.017818e-06 1.510837e-05
## 3 5.998123e-06 1.376145e-06 7.562140e-06
## HARASSMENT.STALKING DRIVING.UNDER.THE.INFLUENCE FIRE.OTHER
## 2 6.529590e-06 2.208542e-04 1.862995e-05
## 3 2.612887e-06 7.449833e-05 5.584414e-06
## POL.INFORMATION LOST.PROPERTY RECOVERED.PROPERTY.MONT..CO.
## 2 6.244082e-05 1.199309e-04 3.292426e-05
## 3 1.504916e-05 4.048773e-05 1.150863e-05
## community_facilities_count Number_of_Crimes_2014 Bedrooms Baths.All
## 2 0.007503704 5.321357e-05 0.03901113 0.04347633
## 3 0.011160954 6.934534e-05 0.04915004 0.04913822
## Type.yBack-to-Back Type.yDetached Type.yDuplex Type.yDwelling w/Rental
## 2 1.209654e-16 0.05233219 2.019629e-04 1.865602e-13
## 3 3.135699e-10 0.04321516 1.042249e-11 6.672444e-18
## Type.yGarden 1-4 Floors Type.yHi-Rise 9+ Floors Type.yHouse of Worship
## 2 0.041166218 0.05485336 8.439325e-15
## 3 0.004123571 0.01764819 9.859520e-17
## Type.yMid-Rise 5-8 Floors Type.yMulti-Family Type.yOther
## 2 0.0018346724 0.0001945807 0.0005766579
## 3 0.0008557773 0.0001943774 0.0002634886
## Type.yPatio Home Type.yPenthouse Type.yQuad Type.ySemi-Detached
## 2 0.0009542987 1.609874e-19 7.759981e-13 0.0011475491
## 3 0.0001490200 8.687959e-22 1.591584e-09 0.0009517327
## Type.yTownhouse Has.GarageTRUE
## 2 0.05229843 0.05013745
## 3 0.04036205 0.03938284
##
## Residual Deviance: 7431.002
## AIC: 7607.002
#tidy(M10)
pr1=predict(M10,test1,"probs")
# A sample of Class predicted probabilities for some records
head(pr1)
## 1 2 3
## 2892 1.197786e-02 0.82612312 1.618990e-01
## 4048 1.004290e-03 0.27800720 7.209885e-01
## 6246 2.221725e-04 0.50038952 4.993883e-01
## 9911 9.067750e-01 0.09319558 2.939987e-05
## 2200 1.091404e-06 0.05158395 9.484150e-01
## 9803 8.630302e-01 0.13695430 1.545830e-05
test1$pr=apply(pr1,1,which.max)
c(Accuracy=mean(test1$pr==test1$class))
## Accuracy
## 0.7999078
confusionMatrix(test$pr,test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 435 71 0
## 2 71 938 112
## 3 1 71 470
##
## Overall Statistics
##
## Accuracy : 0.8497
## 95% CI : (0.834, 0.8645)
## No Information Rate : 0.4979
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.758
## Mcnemar's Test P-Value : 0.01705
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.8580 0.8685 0.8076
## Specificity 0.9573 0.8320 0.9546
## Pos Pred Value 0.8597 0.8368 0.8672
## Neg Pred Value 0.9567 0.8645 0.9312
## Prevalence 0.2337 0.4979 0.2683
## Detection Rate 0.2006 0.4325 0.2167
## Detection Prevalence 0.2333 0.5168 0.2499
## Balanced Accuracy 0.9076 0.8502 0.8811
#Cross-validation method for Average multninomial logit accuracy
Accuracy=numeric(30)
for ( i in 1:30){
index=nrow(main)
index2=sample(index, round(index/5))
train=main[-index2,]
test=main[index2,]
train1=train[,c(2:24,25,27,45,44,50,52,54)]
test1=test[,c(2:24,25,27,45,44,50,52,54)]
M10=multinom(class~.,data=train1)
pr1=predict(M10,test1,"probs")
test1$pr=apply(pr1,1,which.max)
Accuracy[i]=mean(test1$pr==test1$class)
}
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7105.839697
## iter 20 value 6023.496175
## iter 30 value 5445.020137
## iter 40 value 5007.965309
## iter 50 value 4749.463400
## iter 60 value 4244.785823
## iter 70 value 4186.857940
## iter 80 value 4138.058929
## iter 90 value 4118.467423
## iter 100 value 4082.469492
## final value 4082.469492
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7135.014947
## iter 20 value 6059.680516
## iter 30 value 5475.088847
## iter 40 value 5023.842596
## iter 50 value 4738.586909
## iter 60 value 4337.107049
## iter 70 value 4185.268935
## iter 80 value 4123.814479
## iter 90 value 4086.519351
## iter 100 value 4067.653114
## final value 4067.653114
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7208.317916
## iter 20 value 6005.214442
## iter 30 value 5464.227204
## iter 40 value 5018.584316
## iter 50 value 4751.177540
## iter 60 value 4225.319946
## iter 70 value 4174.280911
## iter 80 value 4143.157977
## iter 90 value 4097.287034
## iter 100 value 4058.421959
## final value 4058.421959
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7166.782192
## iter 20 value 6028.409496
## iter 30 value 5467.404732
## iter 40 value 5003.532293
## iter 50 value 4762.222360
## iter 60 value 4258.096296
## iter 70 value 4195.468185
## iter 80 value 4134.634417
## iter 90 value 4091.730016
## iter 100 value 4061.128377
## final value 4061.128377
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7022.418962
## iter 20 value 5997.342246
## iter 30 value 5472.542457
## iter 40 value 5023.248985
## iter 50 value 4738.037520
## iter 60 value 4269.994764
## iter 70 value 4180.103675
## iter 80 value 4128.367869
## iter 90 value 4096.169057
## iter 100 value 4070.347236
## final value 4070.347236
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7202.614986
## iter 20 value 6010.096109
## iter 30 value 5426.072320
## iter 40 value 4982.700701
## iter 50 value 4685.328886
## iter 60 value 4186.092120
## iter 70 value 4130.509278
## iter 80 value 4116.043807
## iter 90 value 4066.545502
## iter 100 value 4023.034098
## final value 4023.034098
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7140.846723
## iter 20 value 6217.601798
## iter 30 value 5466.521834
## iter 40 value 4990.748137
## iter 50 value 4770.106584
## iter 60 value 4263.999393
## iter 70 value 4227.458309
## iter 80 value 4175.355555
## iter 90 value 4150.559695
## iter 100 value 4108.548111
## final value 4108.548111
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7236.175801
## iter 20 value 6069.491718
## iter 30 value 5464.039860
## iter 40 value 5018.690733
## iter 50 value 4769.447439
## iter 60 value 4218.373407
## iter 70 value 4156.000673
## iter 80 value 4139.063770
## iter 90 value 4086.610972
## iter 100 value 4052.170363
## final value 4052.170363
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7190.349366
## iter 20 value 5730.693508
## iter 30 value 5076.572623
## iter 40 value 4617.835915
## iter 50 value 4346.226195
## iter 60 value 3799.541337
## iter 70 value 3743.762615
## iter 80 value 3682.870004
## iter 90 value 3625.771219
## iter 100 value 3571.836094
## final value 3571.836094
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7273.558162
## iter 20 value 6112.751289
## iter 30 value 5474.007824
## iter 40 value 5022.617767
## iter 50 value 4765.617697
## iter 60 value 4274.133519
## iter 70 value 4192.081064
## iter 80 value 4136.698999
## iter 90 value 4106.209165
## iter 100 value 4077.344476
## final value 4077.344476
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7174.034784
## iter 20 value 6106.183351
## iter 30 value 5483.298746
## iter 40 value 5021.305274
## iter 50 value 4754.061972
## iter 60 value 4250.456333
## iter 70 value 4213.615577
## iter 80 value 4156.869602
## iter 90 value 4106.622201
## iter 100 value 4067.507285
## final value 4067.507285
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7261.751908
## iter 20 value 6045.948425
## iter 30 value 5477.426182
## iter 40 value 5025.821033
## iter 50 value 4748.324486
## iter 60 value 4419.279502
## iter 70 value 4187.946854
## iter 80 value 4141.722413
## iter 90 value 4106.162089
## iter 100 value 4064.314214
## final value 4064.314214
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7080.283786
## iter 20 value 6058.801428
## iter 30 value 5482.469334
## iter 40 value 5024.469111
## iter 50 value 4772.232679
## iter 60 value 4299.205513
## iter 70 value 4209.622877
## iter 80 value 4150.647096
## iter 90 value 4102.727909
## iter 100 value 4079.964787
## final value 4079.964787
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7087.466181
## iter 20 value 5962.590047
## iter 30 value 5307.836619
## iter 40 value 4875.993200
## iter 50 value 4602.081792
## iter 60 value 4038.053169
## iter 70 value 3974.354781
## iter 80 value 3944.183305
## iter 90 value 3916.760358
## iter 100 value 3878.188579
## final value 3878.188579
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7200.227027
## iter 20 value 6077.972104
## iter 30 value 5509.781949
## iter 40 value 5049.285797
## iter 50 value 4792.212236
## iter 60 value 4261.471543
## iter 70 value 4233.531850
## iter 80 value 4179.810943
## iter 90 value 4132.787544
## iter 100 value 4097.884576
## final value 4097.884576
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7856.316274
## iter 20 value 6102.301223
## iter 30 value 5398.324846
## iter 40 value 4968.335143
## iter 50 value 4681.658270
## iter 60 value 4093.152759
## iter 70 value 4043.341653
## iter 80 value 4022.443177
## iter 90 value 3980.886936
## iter 100 value 3951.918703
## final value 3951.918703
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7804.713747
## iter 20 value 6188.629836
## iter 30 value 5363.594779
## iter 40 value 4909.150939
## iter 50 value 4625.239873
## iter 60 value 4080.241645
## iter 70 value 4040.213520
## iter 80 value 3984.152458
## iter 90 value 3953.513140
## iter 100 value 3916.372736
## final value 3916.372736
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7188.549569
## iter 20 value 5904.022514
## iter 30 value 5360.103007
## iter 40 value 4924.909673
## iter 50 value 4650.108939
## iter 60 value 4089.192050
## iter 70 value 4031.149680
## iter 80 value 4010.681316
## iter 90 value 3969.927233
## iter 100 value 3923.423301
## final value 3923.423301
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7375.872033
## iter 20 value 6090.585100
## iter 30 value 5467.543515
## iter 40 value 4980.042136
## iter 50 value 4734.314456
## iter 60 value 4195.292645
## iter 70 value 4168.339358
## iter 80 value 4106.998503
## iter 90 value 4063.648565
## iter 100 value 4050.394481
## final value 4050.394481
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7033.906586
## iter 20 value 5828.231196
## iter 30 value 5123.018317
## iter 40 value 4623.393673
## iter 50 value 4362.988424
## iter 60 value 3807.197740
## iter 70 value 3766.607725
## iter 80 value 3706.908738
## iter 90 value 3649.413003
## iter 100 value 3599.811434
## final value 3599.811434
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7127.284728
## iter 20 value 6012.426576
## iter 30 value 5407.461383
## iter 40 value 4976.596076
## iter 50 value 4730.332076
## iter 60 value 4198.119005
## iter 70 value 4152.836119
## iter 80 value 4115.832669
## iter 90 value 4070.209195
## iter 100 value 4042.579084
## final value 4042.579084
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7073.150645
## iter 20 value 5793.294560
## iter 30 value 5130.347001
## iter 40 value 4691.391994
## iter 50 value 4410.077179
## iter 60 value 4113.440869
## iter 70 value 3801.304935
## iter 80 value 3737.824758
## iter 90 value 3678.121273
## iter 100 value 3634.774243
## final value 3634.774243
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7225.487861
## iter 20 value 6053.719431
## iter 30 value 5215.611917
## iter 40 value 4698.567276
## iter 50 value 4451.473067
## iter 60 value 4109.102026
## iter 70 value 3855.275404
## iter 80 value 3833.125433
## iter 90 value 3794.294053
## iter 100 value 3738.240342
## final value 3738.240342
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7153.641837
## iter 20 value 6085.890561
## iter 30 value 5443.420213
## iter 40 value 5038.606641
## iter 50 value 4752.814594
## iter 60 value 4310.620702
## iter 70 value 4214.795374
## iter 80 value 4171.238959
## iter 90 value 4126.806942
## iter 100 value 4107.981210
## final value 4107.981210
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7042.298483
## iter 20 value 6192.491305
## iter 30 value 5418.478648
## iter 40 value 4983.261262
## iter 50 value 4716.474210
## iter 60 value 4181.572706
## iter 70 value 4125.474975
## iter 80 value 4097.046307
## iter 90 value 4063.649070
## iter 100 value 4028.598656
## final value 4028.598656
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7200.214615
## iter 20 value 6022.668539
## iter 30 value 5360.896200
## iter 40 value 4948.762893
## iter 50 value 4689.223030
## iter 60 value 4184.011941
## iter 70 value 4130.646980
## iter 80 value 4087.982500
## iter 90 value 4036.372147
## iter 100 value 3996.775720
## final value 3996.775720
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7094.502291
## iter 20 value 6151.584977
## iter 30 value 5502.131229
## iter 40 value 5039.699109
## iter 50 value 4766.266733
## iter 60 value 4591.709306
## iter 70 value 4277.711178
## iter 80 value 4162.506920
## iter 90 value 4112.279841
## iter 100 value 4067.416895
## final value 4067.416895
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7097.860746
## iter 20 value 6015.157372
## iter 30 value 5481.868758
## iter 40 value 5020.102096
## iter 50 value 4745.198321
## iter 60 value 4289.080117
## iter 70 value 4137.829026
## iter 80 value 4081.524922
## iter 90 value 4030.484729
## iter 100 value 3991.372766
## final value 3991.372766
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7268.362291
## iter 20 value 6055.808026
## iter 30 value 5503.086476
## iter 40 value 5053.442152
## iter 50 value 4789.741391
## iter 60 value 4419.993241
## iter 70 value 4227.328984
## iter 80 value 4184.518989
## iter 90 value 4150.726732
## iter 100 value 4115.177652
## final value 4115.177652
## stopped after 100 iterations
## # weights: 135 (88 variable)
## initial value 9529.362992
## iter 10 value 7681.985262
## iter 20 value 6057.289778
## iter 30 value 5409.816375
## iter 40 value 4946.592599
## iter 50 value 4671.682045
## iter 60 value 4133.695070
## iter 70 value 4087.186213
## iter 80 value 4034.487235
## iter 90 value 3980.387336
## iter 100 value 3943.950631
## final value 3943.950631
## stopped after 100 iterations
mean(Accuracy)
## [1] 0.8175042
################ Reression for price difference #########################
#add a variable "price_dif" which is the difference of original price and closing price
#The goal is to predict the reduction or increase in hosuing price in the market based on its attributes
# log of price is used because of its large amount
main$Median_Sales=as.numeric(main$Median_Sales)
index=nrow(main)
index2=sample(index, round(index/5))
train=main[-index2,]
test=main[index2,]
#effect of crime
train1=train[,c(2:24,53)]
ml1=glm(price_dif~.,data = train1)
summary(ml1)
##
## Call:
## glm(formula = price_dif ~ ., data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.76270 -0.02206 0.01200 0.03285 0.40470
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.035262 0.006923 -5.093 3.59e-07 ***
## ROB.FIREARM...STREET -2.420737 5.326994 -0.454 0.64953
## AGG.ASSLT.FIREARM.CITIZEN 16.961650 11.562539 1.467 0.14243
## BURG.FORCE.RES.NIGHT 4.768273 2.352337 2.027 0.04269 *
## LARCENY.PICK.POCKET 1.324213 0.314893 4.205 2.63e-05 ***
## AUTO.THEFT...PASSENGER.VEHICLE -9.702350 6.653880 -1.458 0.14484
## ASSAULT...BATTERY...CITIZEN -10.661660 3.342767 -3.189 0.00143 **
## VANDALISM.MOTOR.VEHICLE 1.477270 2.053873 0.719 0.47200
## WEAPON.POSSESSION.HANDGUN 18.591372 11.392240 1.632 0.10273
## SEX.OFFENSE...SEX..ASSAULT 18.631188 14.259604 1.307 0.19139
## drug 1.217763 1.449388 0.840 0.40082
## FAMILY.OFFENSE...ABUSE.CHILD -10.109937 11.205304 -0.902 0.36695
## JUVENILE.RUNAWAY 8.292250 5.914022 1.402 0.16091
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.185191 3.911979 0.047 0.96224
## DISORDERLY.CONDUCT 0.648324 2.023401 0.320 0.74866
## SUICIDE...POISON.OVERDOSE -2.132457 13.272665 -0.161 0.87236
## LITTERING.TRASH.DUMPING -71.657984 21.999301 -3.257 0.00113 **
## TRESPASSING 5.139563 5.322225 0.966 0.33423
## HARASSMENT.STALKING -69.408313 18.130170 -3.828 0.00013 ***
## DRIVING.UNDER.THE.INFLUENCE 0.163945 1.087251 0.151 0.88015
## FIRE.OTHER -1.413327 13.841924 -0.102 0.91868
## POL.INFORMATION 1.337901 1.228669 1.089 0.27623
## LOST.PROPERTY -7.285919 2.546216 -2.861 0.00423 **
## RECOVERED.PROPERTY.MONT..CO. 6.764239 4.123758 1.640 0.10098
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.003611934)
##
## Null deviance: 31.592 on 8672 degrees of freedom
## Residual deviance: 31.240 on 8649 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: -24134
##
## Number of Fisher Scoring iterations: 2
#eefect of housing attributes
train1=train[,c(25,27,31,36,40,44,45,50,52,53)]
ml1=glm(price_dif~.-Original.List.Price-Median_Sales+log(Original.List.Price)+log(Median_Sales),data = train1)
summary(ml1)
##
## Call:
## glm(formula = price_dif ~ . - Original.List.Price - Median_Sales +
## log(Original.List.Price) + log(Median_Sales), data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.75500 -0.02270 0.01059 0.03099 0.40618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.478e-01 2.297e-02 6.437 1.29e-10 ***
## community_facilities_count -2.644e-04 1.167e-04 -2.265 0.023539 *
## Number_of_Crimes_2014 1.350e-07 7.811e-07 0.173 0.862809
## Date.Quarter2 1.433e-02 1.940e-03 7.386 1.66e-13 ***
## Date.Quarter3 3.126e-03 2.085e-03 1.499 0.133941
## Date.Quarter4 -5.806e-03 2.149e-03 -2.702 0.006906 **
## Baths.All -7.261e-04 4.564e-04 -1.591 0.111670
## Bedrooms -5.977e-04 9.497e-04 -0.629 0.529171
## Type.yBack-to-Back -1.153e-02 1.343e-02 -0.859 0.390508
## Type.yDetached -1.980e-03 5.518e-03 -0.359 0.719771
## Type.yDuplex -2.907e-02 1.789e-02 -1.625 0.104288
## Type.yDwelling w/Rental -1.231e-01 4.251e-02 -2.897 0.003782 **
## Type.yGarden 1-4 Floors -2.587e-02 5.806e-03 -4.456 8.44e-06 ***
## Type.yHi-Rise 9+ Floors -3.180e-02 6.043e-03 -5.261 1.46e-07 ***
## Type.yMid-Rise 5-8 Floors -2.616e-02 8.276e-03 -3.161 0.001576 **
## Type.yMulti-Family -1.353e-02 2.297e-02 -0.589 0.556009
## Type.yOther -4.800e-02 1.458e-02 -3.293 0.000997 ***
## Type.yPatio Home -1.377e-04 1.067e-02 -0.013 0.989696
## Type.yPenthouse -5.087e-02 4.215e-02 -1.207 0.227545
## Type.yQuad -4.121e-02 4.216e-02 -0.977 0.328386
## Type.ySemi-Detached -1.541e-04 1.297e-02 -0.012 0.990520
## Type.yTownhouse 3.384e-03 5.473e-03 0.618 0.536412
## Has.GarageTRUE 1.880e-03 1.485e-03 1.266 0.205610
## log(Original.List.Price) -1.425e-02 1.956e-03 -7.286 3.47e-13 ***
## log(Median_Sales) 4.952e-03 9.623e-04 5.146 2.72e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.003490452)
##
## Null deviance: 31.592 on 8672 degrees of freedom
## Residual deviance: 30.185 on 8648 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: -24430
##
## Number of Fisher Scoring iterations: 2
tidy(ml1)
## term estimate std.error statistic
## 1 (Intercept) 1.478356e-01 2.296758e-02 6.43670859
## 2 community_facilities_count -2.644223e-04 1.167438e-04 -2.26497929
## 3 Number_of_Crimes_2014 1.349851e-07 7.811420e-07 0.17280478
## 4 Date.Quarter2 1.432851e-02 1.939992e-03 7.38585991
## 5 Date.Quarter3 3.125864e-03 2.085463e-03 1.49888271
## 6 Date.Quarter4 -5.805675e-03 2.148684e-03 -2.70196835
## 7 Baths.All -7.260913e-04 4.564046e-04 -1.59089393
## 8 Bedrooms -5.976500e-04 9.497090e-04 -0.62929804
## 9 Type.yBack-to-Back -1.153305e-02 1.343020e-02 -0.85873985
## 10 Type.yDetached -1.979723e-03 5.517978e-03 -0.35877688
## 11 Type.yDuplex -2.906914e-02 1.789330e-02 -1.62458292
## 12 Type.yDwelling w/Rental -1.231304e-01 4.250939e-02 -2.89654632
## 13 Type.yGarden 1-4 Floors -2.587230e-02 5.805587e-03 -4.45644878
## 14 Type.yHi-Rise 9+ Floors -3.179547e-02 6.043154e-03 -5.26140305
## 15 Type.yMid-Rise 5-8 Floors -2.616366e-02 8.276083e-03 -3.16135739
## 16 Type.yMulti-Family -1.352679e-02 2.297341e-02 -0.58880225
## 17 Type.yOther -4.800006e-02 1.457847e-02 -3.29252991
## 18 Type.yPatio Home -1.377493e-04 1.066568e-02 -0.01291520
## 19 Type.yPenthouse -5.087148e-02 4.215425e-02 -1.20679355
## 20 Type.yQuad -4.121091e-02 4.216269e-02 -0.97742621
## 21 Type.ySemi-Detached -1.541212e-04 1.297047e-02 -0.01188246
## 22 Type.yTownhouse 3.383860e-03 5.473084e-03 0.61827302
## 23 Has.GarageTRUE 1.880142e-03 1.485310e-03 1.26582428
## 24 log(Original.List.Price) -1.425253e-02 1.956122e-03 -7.28611518
## 25 log(Median_Sales) 4.952213e-03 9.623255e-04 5.14608958
## p.value
## 1 1.285869e-10
## 2 2.353859e-02
## 3 8.628089e-01
## 4 1.655346e-13
## 5 1.339406e-01
## 6 6.906480e-03
## 7 1.116700e-01
## 8 5.291706e-01
## 9 3.905079e-01
## 10 7.197708e-01
## 11 1.042879e-01
## 12 3.782410e-03
## 13 8.437757e-06
## 14 1.463862e-07
## 15 1.575780e-03
## 16 5.560093e-01
## 17 9.968877e-04
## 18 9.896957e-01
## 19 2.275447e-01
## 20 3.283855e-01
## 21 9.905197e-01
## 22 5.364117e-01
## 23 2.056101e-01
## 24 3.470364e-13
## 25 2.718178e-07
ml2=xtable(ml1)
write.csv(ml2,"23.csv")
######################Regression for Days in Market
#The goal is to build a regression model to predict how long a certain house will be in Market before being sold
main$Median_Sales=as.numeric(main$Median_Sales)
index=nrow(main)
index2=sample(index, round(index/5))
train=main[-index2,]
test=main[index2,]
train1=train[,c(36,40,44,45,50,52,43)]
ml1=lm(DOMP~.-Original.List.Price+log(Original.List.Price),data = train1)
#ml2=step(ml1,direction = "both",scope = list(lower=DOMP~1,upper=DOMP~.),k=3)
summary(ml1)
##
## Call:
## lm(formula = DOMP ~ . - Original.List.Price + log(Original.List.Price),
## data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -337.80 -36.22 -20.03 12.89 865.59
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -107.2065 22.6302 -4.737 2.20e-06 ***
## Date.Quarter2 -19.6676 2.1704 -9.062 < 2e-16 ***
## Date.Quarter3 -13.8429 2.3216 -5.963 2.58e-09 ***
## Date.Quarter4 1.3023 2.3949 0.544 0.58662
## Baths.All 2.6765 0.5102 5.246 1.59e-07 ***
## Bedrooms 2.6131 1.0411 2.510 0.01209 *
## Type.yBack-to-Back 20.8632 15.1461 1.377 0.16841
## Type.yDetached 12.9059 6.3847 2.021 0.04327 *
## Type.yDuplex 30.2235 17.7212 1.705 0.08814 .
## Type.yDwelling w/Rental -13.9455 47.6832 -0.292 0.76994
## Type.yGarden 1-4 Floors 30.3590 6.7039 4.529 6.02e-06 ***
## Type.yHi-Rise 9+ Floors 38.1127 6.9583 5.477 4.44e-08 ***
## Type.yHouse of Worship -39.7240 66.6380 -0.596 0.55111
## Type.yMid-Rise 5-8 Floors 37.7133 9.2661 4.070 4.74e-05 ***
## Type.yMulti-Family 1.7928 30.3172 0.059 0.95285
## Type.yOther 21.6248 18.2179 1.187 0.23526
## Type.yPatio Home 5.1864 13.3213 0.389 0.69704
## Type.yPenthouse 142.4972 47.3342 3.010 0.00262 **
## Type.yQuad 56.7905 66.6152 0.853 0.39395
## Type.ySemi-Detached 20.3164 15.1529 1.341 0.18003
## Type.yTownhouse 8.9375 6.3335 1.411 0.15824
## Has.GarageTRUE 5.3143 1.6535 3.214 0.00131 **
## log(Original.List.Price) 9.9225 1.8302 5.421 6.07e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.31 on 8651 degrees of freedom
## Multiple R-squared: 0.04977, Adjusted R-squared: 0.04735
## F-statistic: 20.59 on 22 and 8651 DF, p-value: < 2.2e-16
tidy(ml1)
## term estimate std.error statistic
## 1 (Intercept) -107.206530 22.6301732 -4.73732696
## 2 Date.Quarter2 -19.667567 2.1704108 -9.06167963
## 3 Date.Quarter3 -13.842911 2.3216164 -5.96261779
## 4 Date.Quarter4 1.302255 2.3949307 0.54375496
## 5 Baths.All 2.676525 0.5102187 5.24583909
## 6 Bedrooms 2.613084 1.0410786 2.50997730
## 7 Type.yBack-to-Back 20.863189 15.1461445 1.37745874
## 8 Type.yDetached 12.905937 6.3847395 2.02137259
## 9 Type.yDuplex 30.223452 17.7212388 1.70549318
## 10 Type.yDwelling w/Rental -13.945487 47.6832022 -0.29246121
## 11 Type.yGarden 1-4 Floors 30.358961 6.7038788 4.52856651
## 12 Type.yHi-Rise 9+ Floors 38.112699 6.9582626 5.47732981
## 13 Type.yHouse of Worship -39.724007 66.6379837 -0.59611658
## 14 Type.yMid-Rise 5-8 Floors 37.713275 9.2661011 4.07002626
## 15 Type.yMulti-Family 1.792797 30.3172394 0.05913457
## 16 Type.yOther 21.624833 18.2178637 1.18701253
## 17 Type.yPatio Home 5.186374 13.3213271 0.38932861
## 18 Type.yPenthouse 142.497159 47.3342201 3.01044696
## 19 Type.yQuad 56.790479 66.6152141 0.85251514
## 20 Type.ySemi-Detached 20.316445 15.1529301 1.34076019
## 21 Type.yTownhouse 8.937468 6.3334762 1.41114729
## 22 Has.GarageTRUE 5.314255 1.6535164 3.21391149
## 23 log(Original.List.Price) 9.922493 1.8302322 5.42143969
## p.value
## 1 2.200052e-06
## 2 1.566354e-19
## 3 2.579601e-09
## 4 5.866241e-01
## 5 1.592566e-07
## 6 1.209200e-02
## 7 1.684061e-01
## 8 4.327194e-02
## 9 8.813838e-02
## 10 7.699410e-01
## 11 6.018017e-06
## 12 4.439015e-08
## 13 5.511129e-01
## 14 4.742603e-05
## 15 9.528463e-01
## 16 2.352553e-01
## 17 6.970427e-01
## 18 2.616161e-03
## 19 3.939518e-01
## 20 1.800335e-01
## 21 1.582371e-01
## 22 1.314204e-03
## 23 6.071356e-08
ml2=xtable(ml1)
write.csv(ml2,"results1.csv")
visreg(ml1)
########################## Regression models for housing price and PCA ( By Natasha)########################
main=data_4
#change the name of main so this works with my code
full <- main
names(full)
## [1] "Zip.Code"
## [2] "ROB.FIREARM...STREET"
## [3] "AGG.ASSLT.FIREARM.CITIZEN"
## [4] "BURG.FORCE.RES.NIGHT"
## [5] "LARCENY.PICK.POCKET"
## [6] "AUTO.THEFT...PASSENGER.VEHICLE"
## [7] "ASSAULT...BATTERY...CITIZEN"
## [8] "VANDALISM.MOTOR.VEHICLE"
## [9] "WEAPON.POSSESSION.HANDGUN"
## [10] "SEX.OFFENSE...SEX..ASSAULT"
## [11] "drug"
## [12] "FAMILY.OFFENSE...ABUSE.CHILD"
## [13] "JUVENILE.RUNAWAY"
## [14] "LIQUOR...UNLAWFUL.POSS.UNDER.21"
## [15] "DISORDERLY.CONDUCT"
## [16] "SUICIDE...POISON.OVERDOSE"
## [17] "LITTERING.TRASH.DUMPING"
## [18] "TRESPASSING"
## [19] "HARASSMENT.STALKING"
## [20] "DRIVING.UNDER.THE.INFLUENCE"
## [21] "FIRE.OTHER"
## [22] "POL.INFORMATION"
## [23] "LOST.PROPERTY"
## [24] "RECOVERED.PROPERTY.MONT..CO."
## [25] "community_facilities_count"
## [26] "Number_of_Sales_2014"
## [27] "Number_of_Crimes_2014"
## [28] "Type.x"
## [29] "IRS_Estimated_Population_2014"
## [30] "Total_Number_of_Sales_State_Planning"
## [31] "Median_Sales"
## [32] "Mean_Sales"
## [33] "ML."
## [34] "City"
## [35] "List.Price"
## [36] "Original.List.Price"
## [37] "Close.Price"
## [38] "Legal.Subdivision"
## [39] "Status"
## [40] "Date.Quarter"
## [41] "Close.Date"
## [42] "DOMM"
## [43] "DOMP"
## [44] "Baths.All"
## [45] "Bedrooms"
## [46] "Condo.Coop.Fee"
## [47] "HOA.Fee"
## [48] "Lot.Sqft"
## [49] "Total.Square.Footage"
## [50] "Type.y"
## [51] "Parking"
## [52] "Has.Garage"
#remove Condo.Coop.Fee, HOA.Fee, lot.sqft (46-48) because they are not needed and have NAs
#51 and 52 were for Arash's analysis and have been removed
full <- full[,-c(46:48,51,52)]
names(full)
## [1] "Zip.Code"
## [2] "ROB.FIREARM...STREET"
## [3] "AGG.ASSLT.FIREARM.CITIZEN"
## [4] "BURG.FORCE.RES.NIGHT"
## [5] "LARCENY.PICK.POCKET"
## [6] "AUTO.THEFT...PASSENGER.VEHICLE"
## [7] "ASSAULT...BATTERY...CITIZEN"
## [8] "VANDALISM.MOTOR.VEHICLE"
## [9] "WEAPON.POSSESSION.HANDGUN"
## [10] "SEX.OFFENSE...SEX..ASSAULT"
## [11] "drug"
## [12] "FAMILY.OFFENSE...ABUSE.CHILD"
## [13] "JUVENILE.RUNAWAY"
## [14] "LIQUOR...UNLAWFUL.POSS.UNDER.21"
## [15] "DISORDERLY.CONDUCT"
## [16] "SUICIDE...POISON.OVERDOSE"
## [17] "LITTERING.TRASH.DUMPING"
## [18] "TRESPASSING"
## [19] "HARASSMENT.STALKING"
## [20] "DRIVING.UNDER.THE.INFLUENCE"
## [21] "FIRE.OTHER"
## [22] "POL.INFORMATION"
## [23] "LOST.PROPERTY"
## [24] "RECOVERED.PROPERTY.MONT..CO."
## [25] "community_facilities_count"
## [26] "Number_of_Sales_2014"
## [27] "Number_of_Crimes_2014"
## [28] "Type.x"
## [29] "IRS_Estimated_Population_2014"
## [30] "Total_Number_of_Sales_State_Planning"
## [31] "Median_Sales"
## [32] "Mean_Sales"
## [33] "ML."
## [34] "City"
## [35] "List.Price"
## [36] "Original.List.Price"
## [37] "Close.Price"
## [38] "Legal.Subdivision"
## [39] "Status"
## [40] "Date.Quarter"
## [41] "Close.Date"
## [42] "DOMM"
## [43] "DOMP"
## [44] "Baths.All"
## [45] "Bedrooms"
## [46] "Total.Square.Footage"
## [47] "Type.y"
#how many NAs?
row.has.na <- apply(full, 1, function(x){any(is.na(x))})
#get rid of rows with na
full <- full[!row.has.na,]
#only 18 records have been removed
#remove $ from median_sales and mean_sales
full$median_sales_num<-substring(full$Median_Sales, 2)
full$mean_sales_num<-substring(full$Mean_Sales, 2)
#change from character to numeric
full$mean_sales_num <- as.numeric(gsub(",", "", full$mean_sales_num))
full$median_sales_num <- as.numeric(gsub(",","", full$median_sales_num))
#another try at omitting NAs because they seem to still be there
full <- na.omit(full)
#create a dataframe with only numeric and factor data
#get the indexes for all of the columns
names(full)
## [1] "Zip.Code"
## [2] "ROB.FIREARM...STREET"
## [3] "AGG.ASSLT.FIREARM.CITIZEN"
## [4] "BURG.FORCE.RES.NIGHT"
## [5] "LARCENY.PICK.POCKET"
## [6] "AUTO.THEFT...PASSENGER.VEHICLE"
## [7] "ASSAULT...BATTERY...CITIZEN"
## [8] "VANDALISM.MOTOR.VEHICLE"
## [9] "WEAPON.POSSESSION.HANDGUN"
## [10] "SEX.OFFENSE...SEX..ASSAULT"
## [11] "drug"
## [12] "FAMILY.OFFENSE...ABUSE.CHILD"
## [13] "JUVENILE.RUNAWAY"
## [14] "LIQUOR...UNLAWFUL.POSS.UNDER.21"
## [15] "DISORDERLY.CONDUCT"
## [16] "SUICIDE...POISON.OVERDOSE"
## [17] "LITTERING.TRASH.DUMPING"
## [18] "TRESPASSING"
## [19] "HARASSMENT.STALKING"
## [20] "DRIVING.UNDER.THE.INFLUENCE"
## [21] "FIRE.OTHER"
## [22] "POL.INFORMATION"
## [23] "LOST.PROPERTY"
## [24] "RECOVERED.PROPERTY.MONT..CO."
## [25] "community_facilities_count"
## [26] "Number_of_Sales_2014"
## [27] "Number_of_Crimes_2014"
## [28] "Type.x"
## [29] "IRS_Estimated_Population_2014"
## [30] "Total_Number_of_Sales_State_Planning"
## [31] "Median_Sales"
## [32] "Mean_Sales"
## [33] "ML."
## [34] "City"
## [35] "List.Price"
## [36] "Original.List.Price"
## [37] "Close.Price"
## [38] "Legal.Subdivision"
## [39] "Status"
## [40] "Date.Quarter"
## [41] "Close.Date"
## [42] "DOMM"
## [43] "DOMP"
## [44] "Baths.All"
## [45] "Bedrooms"
## [46] "Total.Square.Footage"
## [47] "Type.y"
## [48] "median_sales_num"
## [49] "mean_sales_num"
#get the classes for all of the columns
lapply(full, class)
## $Zip.Code
## [1] "integer"
##
## $ROB.FIREARM...STREET
## [1] "numeric"
##
## $AGG.ASSLT.FIREARM.CITIZEN
## [1] "numeric"
##
## $BURG.FORCE.RES.NIGHT
## [1] "numeric"
##
## $LARCENY.PICK.POCKET
## [1] "numeric"
##
## $AUTO.THEFT...PASSENGER.VEHICLE
## [1] "numeric"
##
## $ASSAULT...BATTERY...CITIZEN
## [1] "numeric"
##
## $VANDALISM.MOTOR.VEHICLE
## [1] "numeric"
##
## $WEAPON.POSSESSION.HANDGUN
## [1] "numeric"
##
## $SEX.OFFENSE...SEX..ASSAULT
## [1] "numeric"
##
## $drug
## [1] "numeric"
##
## $FAMILY.OFFENSE...ABUSE.CHILD
## [1] "numeric"
##
## $JUVENILE.RUNAWAY
## [1] "numeric"
##
## $LIQUOR...UNLAWFUL.POSS.UNDER.21
## [1] "numeric"
##
## $DISORDERLY.CONDUCT
## [1] "numeric"
##
## $SUICIDE...POISON.OVERDOSE
## [1] "numeric"
##
## $LITTERING.TRASH.DUMPING
## [1] "numeric"
##
## $TRESPASSING
## [1] "numeric"
##
## $HARASSMENT.STALKING
## [1] "numeric"
##
## $DRIVING.UNDER.THE.INFLUENCE
## [1] "numeric"
##
## $FIRE.OTHER
## [1] "numeric"
##
## $POL.INFORMATION
## [1] "numeric"
##
## $LOST.PROPERTY
## [1] "numeric"
##
## $RECOVERED.PROPERTY.MONT..CO.
## [1] "numeric"
##
## $community_facilities_count
## [1] "integer"
##
## $Number_of_Sales_2014
## [1] "integer"
##
## $Number_of_Crimes_2014
## [1] "integer"
##
## $Type.x
## [1] "factor"
##
## $IRS_Estimated_Population_2014
## [1] "integer"
##
## $Total_Number_of_Sales_State_Planning
## [1] "integer"
##
## $Median_Sales
## [1] "factor"
##
## $Mean_Sales
## [1] "factor"
##
## $ML.
## [1] "factor"
##
## $City
## [1] "factor"
##
## $List.Price
## [1] "numeric"
##
## $Original.List.Price
## [1] "numeric"
##
## $Close.Price
## [1] "numeric"
##
## $Legal.Subdivision
## [1] "factor"
##
## $Status
## [1] "factor"
##
## $Date.Quarter
## [1] "integer"
##
## $Close.Date
## [1] "factor"
##
## $DOMM
## [1] "integer"
##
## $DOMP
## [1] "integer"
##
## $Baths.All
## [1] "integer"
##
## $Bedrooms
## [1] "integer"
##
## $Total.Square.Footage
## [1] "integer"
##
## $Type.y
## [1] "factor"
##
## $median_sales_num
## [1] "numeric"
##
## $mean_sales_num
## [1] "numeric"
#reclass columns we will keep to numeric or factor
full$Zip.Code <- factor(full$Zip.Code)
#delete the columns with dollar signs for median and mean housing price, type of sale (type.y) was removed because it is all standard
full <- full[,-c(28,31,32)]
names(full)
## [1] "Zip.Code"
## [2] "ROB.FIREARM...STREET"
## [3] "AGG.ASSLT.FIREARM.CITIZEN"
## [4] "BURG.FORCE.RES.NIGHT"
## [5] "LARCENY.PICK.POCKET"
## [6] "AUTO.THEFT...PASSENGER.VEHICLE"
## [7] "ASSAULT...BATTERY...CITIZEN"
## [8] "VANDALISM.MOTOR.VEHICLE"
## [9] "WEAPON.POSSESSION.HANDGUN"
## [10] "SEX.OFFENSE...SEX..ASSAULT"
## [11] "drug"
## [12] "FAMILY.OFFENSE...ABUSE.CHILD"
## [13] "JUVENILE.RUNAWAY"
## [14] "LIQUOR...UNLAWFUL.POSS.UNDER.21"
## [15] "DISORDERLY.CONDUCT"
## [16] "SUICIDE...POISON.OVERDOSE"
## [17] "LITTERING.TRASH.DUMPING"
## [18] "TRESPASSING"
## [19] "HARASSMENT.STALKING"
## [20] "DRIVING.UNDER.THE.INFLUENCE"
## [21] "FIRE.OTHER"
## [22] "POL.INFORMATION"
## [23] "LOST.PROPERTY"
## [24] "RECOVERED.PROPERTY.MONT..CO."
## [25] "community_facilities_count"
## [26] "Number_of_Sales_2014"
## [27] "Number_of_Crimes_2014"
## [28] "IRS_Estimated_Population_2014"
## [29] "Total_Number_of_Sales_State_Planning"
## [30] "ML."
## [31] "City"
## [32] "List.Price"
## [33] "Original.List.Price"
## [34] "Close.Price"
## [35] "Legal.Subdivision"
## [36] "Status"
## [37] "Date.Quarter"
## [38] "Close.Date"
## [39] "DOMM"
## [40] "DOMP"
## [41] "Baths.All"
## [42] "Bedrooms"
## [43] "Total.Square.Footage"
## [44] "Type.y"
## [45] "median_sales_num"
## [46] "mean_sales_num"
#full <- full[,-c(33:36, 39)]
#check to see that the classes of the remaining columns are all numeric (including integer), factor
lapply(full, class)
## $Zip.Code
## [1] "factor"
##
## $ROB.FIREARM...STREET
## [1] "numeric"
##
## $AGG.ASSLT.FIREARM.CITIZEN
## [1] "numeric"
##
## $BURG.FORCE.RES.NIGHT
## [1] "numeric"
##
## $LARCENY.PICK.POCKET
## [1] "numeric"
##
## $AUTO.THEFT...PASSENGER.VEHICLE
## [1] "numeric"
##
## $ASSAULT...BATTERY...CITIZEN
## [1] "numeric"
##
## $VANDALISM.MOTOR.VEHICLE
## [1] "numeric"
##
## $WEAPON.POSSESSION.HANDGUN
## [1] "numeric"
##
## $SEX.OFFENSE...SEX..ASSAULT
## [1] "numeric"
##
## $drug
## [1] "numeric"
##
## $FAMILY.OFFENSE...ABUSE.CHILD
## [1] "numeric"
##
## $JUVENILE.RUNAWAY
## [1] "numeric"
##
## $LIQUOR...UNLAWFUL.POSS.UNDER.21
## [1] "numeric"
##
## $DISORDERLY.CONDUCT
## [1] "numeric"
##
## $SUICIDE...POISON.OVERDOSE
## [1] "numeric"
##
## $LITTERING.TRASH.DUMPING
## [1] "numeric"
##
## $TRESPASSING
## [1] "numeric"
##
## $HARASSMENT.STALKING
## [1] "numeric"
##
## $DRIVING.UNDER.THE.INFLUENCE
## [1] "numeric"
##
## $FIRE.OTHER
## [1] "numeric"
##
## $POL.INFORMATION
## [1] "numeric"
##
## $LOST.PROPERTY
## [1] "numeric"
##
## $RECOVERED.PROPERTY.MONT..CO.
## [1] "numeric"
##
## $community_facilities_count
## [1] "integer"
##
## $Number_of_Sales_2014
## [1] "integer"
##
## $Number_of_Crimes_2014
## [1] "integer"
##
## $IRS_Estimated_Population_2014
## [1] "integer"
##
## $Total_Number_of_Sales_State_Planning
## [1] "integer"
##
## $ML.
## [1] "factor"
##
## $City
## [1] "factor"
##
## $List.Price
## [1] "numeric"
##
## $Original.List.Price
## [1] "numeric"
##
## $Close.Price
## [1] "numeric"
##
## $Legal.Subdivision
## [1] "factor"
##
## $Status
## [1] "factor"
##
## $Date.Quarter
## [1] "integer"
##
## $Close.Date
## [1] "factor"
##
## $DOMM
## [1] "integer"
##
## $DOMP
## [1] "integer"
##
## $Baths.All
## [1] "integer"
##
## $Bedrooms
## [1] "integer"
##
## $Total.Square.Footage
## [1] "integer"
##
## $Type.y
## [1] "factor"
##
## $median_sales_num
## [1] "numeric"
##
## $mean_sales_num
## [1] "numeric"
#garage is still logical, not sure if that is ok or not
#split dataset to test, train
index <- 1:nrow(full)
testindex <- sample(index, trunc(length(index)/5))
testset <- full[testindex,]
trainset <- full[-testindex,]
#split for development set
index <- 1:nrow(trainset)
devsetindex <- sample(index, trunc(length(index)/5))
devset <- full[devsetindex,]
trainset <- full[-devsetindex,]
#now we have a testset, devset and trainset
#lm
#See if some of the factors are imporant
#Baseline model
mylogit <- lm(Close.Price ~ drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN + BURG.FORCE.RES.NIGHT, data = trainset
)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=229233
## Close.Price ~ drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN +
## BURG.FORCE.RES.NIGHT
##
##
## Step: AIC=229233
## Close.Price ~ drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN
##
##
## Step: AIC=229233
## Close.Price ~ drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code
##
##
## Step: AIC=229233
## Close.Price ~ drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## Zip.Code
##
##
## Step: AIC=229233
## Close.Price ~ drug + LOST.PROPERTY + Zip.Code
##
##
## Step: AIC=229233
## Close.Price ~ drug + Zip.Code
##
##
## Step: AIC=229233
## Close.Price ~ Zip.Code
##
## Df Sum of Sq RSS AIC
## <none> 7.6993e+14 229233
## - Zip.Code 35 5.1898e+14 1.2889e+15 233856
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN +
## BURG.FORCE.RES.NIGHT
##
## Final Model:
## Close.Price ~ Zip.Code
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 9072 7.699268e+14 229233
## 2 - BURG.FORCE.RES.NIGHT 0 0.000 9072 7.699268e+14 229233
## 3 - AGG.ASSLT.FIREARM.CITIZEN 0 0.000 9072 7.699268e+14 229233
## 4 - LARCENY.PICK.POCKET 0 2.500 9072 7.699268e+14 229233
## 5 - VANDALISM.MOTOR.VEHICLE 0 0.375 9072 7.699268e+14 229233
## 6 - LOST.PROPERTY 0 2.375 9072 7.699268e+14 229233
## 7 - drug 0 3.875 9072 7.699268e+14 229233
plot(mylogit)
#
#
#
#
#
#predict house price
devset$predicted_close_price1<-predict(mylogit, devset)
## Warning in predict.lm(mylogit, devset): prediction from a rank-deficient
## fit may be misleading
# how far off are the predicted prices
devset$difference1 <- devset$predicted_close_price1 - devset$Close.Price
devset$percent_error1 <- abs(devset$difference1/devset$Close.Price)
#Model 2 - The model from Model 1 final
mylogit <- lm(Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count + Number_of_Crimes_2014 + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN + BURG.FORCE.RES.NIGHT, data = trainset)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + drug +
## LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + AGG.ASSLT.FIREARM.CITIZEN + BURG.FORCE.RES.NIGHT
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + drug +
## LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + AGG.ASSLT.FIREARM.CITIZEN
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + drug +
## LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + drug +
## LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + drug +
## LOST.PROPERTY + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + drug +
## Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
## Df Sum of Sq RSS AIC
## <none> 5.6217e+14 226377
## - Date.Quarter 1 2.4396e+11 5.6241e+14 226379
## - Baths.All 1 1.5544e+13 5.7771e+14 226623
## - Bedrooms 1 3.3119e+13 5.9529e+14 226896
## - Total.Square.Footage 1 5.1436e+13 6.1361e+14 227172
## - Zip.Code 35 3.7703e+14 9.3920e+14 230981
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Number_of_Crimes_2014 + drug +
## LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + AGG.ASSLT.FIREARM.CITIZEN + BURG.FORCE.RES.NIGHT
##
## Final Model:
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 9068 5.621702e+14 226376.6
## 2 - BURG.FORCE.RES.NIGHT 0 0.0000 9068 5.621702e+14 226376.6
## 3 - AGG.ASSLT.FIREARM.CITIZEN 0 0.0000 9068 5.621702e+14 226376.6
## 4 - LARCENY.PICK.POCKET 0 0.1250 9068 5.621702e+14 226376.6
## 5 - VANDALISM.MOTOR.VEHICLE 0 0.3125 9068 5.621702e+14 226376.6
## 6 - LOST.PROPERTY 0 0.0000 9068 5.621702e+14 226376.6
## 7 - drug 0 0.0625 9068 5.621702e+14 226376.6
## 8 - Number_of_Crimes_2014 0 0.5000 9068 5.621702e+14 226376.6
## 9 - community_facilities_count 0 0.4375 9068 5.621702e+14 226376.6
plot(mylogit)
#
#
#
#
#
#predict house price
devset$predicted_close_price2<-predict(mylogit, devset)
## Warning in predict.lm(mylogit, devset): prediction from a rank-deficient
## fit may be misleading
# how far off are the predicted prices
devset$difference2 <- devset$predicted_close_price2 - devset$Close.Price
devset$percent_error2 <- abs(devset$difference2/devset$Close.Price)
#Model 3
mylogit <- lm(Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN + BURG.FORCE.RES.NIGHT, data = trainset
)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN +
## BURG.FORCE.RES.NIGHT
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + drug + LOST.PROPERTY + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + drug + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
## Df Sum of Sq RSS AIC
## <none> 5.6217e+14 226377
## - Date.Quarter 1 2.4396e+11 5.6241e+14 226379
## - Baths.All 1 1.5544e+13 5.7771e+14 226623
## - Bedrooms 1 3.3119e+13 5.9529e+14 226896
## - Total.Square.Footage 1 5.1436e+13 6.1361e+14 227172
## - Zip.Code 35 3.7703e+14 9.3920e+14 230981
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE +
## LARCENY.PICK.POCKET + Zip.Code + AGG.ASSLT.FIREARM.CITIZEN +
## BURG.FORCE.RES.NIGHT
##
## Final Model:
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 9068 5.621702e+14 226376.6
## 2 - BURG.FORCE.RES.NIGHT 0 0.0000 9068 5.621702e+14 226376.6
## 3 - AGG.ASSLT.FIREARM.CITIZEN 0 0.0000 9068 5.621702e+14 226376.6
## 4 - LARCENY.PICK.POCKET 0 0.1875 9068 5.621702e+14 226376.6
## 5 - VANDALISM.MOTOR.VEHICLE 0 0.0625 9068 5.621702e+14 226376.6
## 6 - LOST.PROPERTY 0 0.0625 9068 5.621702e+14 226376.6
## 7 - drug 0 0.4375 9068 5.621702e+14 226376.6
## 8 - community_facilities_count 0 0.4375 9068 5.621702e+14 226376.6
plot(mylogit)
#
#
#
#
#
#predict house price
devset$predicted_close_price3<-predict(mylogit, devset)
## Warning in predict.lm(mylogit, devset): prediction from a rank-deficient
## fit may be misleading
# how far off are the predicted prices
devset$difference3 <- devset$predicted_close_price3 - devset$Close.Price
devset$percent_error3 <- abs(devset$difference3/devset$Close.Price)
#Model 4
mylogit <- lm(Close.Price ~ median_sales_num + mean_sales_num + Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION + DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT, data = trainset)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION +
## DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION +
## DRIVING.UNDER.THE.INFLUENCE
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + Bedrooms + Total.Square.Footage +
## Baths.All + Date.Quarter + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
## Df Sum of Sq RSS AIC
## <none> 5.6217e+14 226377
## - Date.Quarter 1 2.4396e+11 5.6241e+14 226379
## - Baths.All 1 1.5544e+13 5.7771e+14 226623
## - Bedrooms 1 3.3119e+13 5.9529e+14 226896
## - Total.Square.Footage 1 5.1436e+13 6.1361e+14 227172
## - Zip.Code 35 3.7703e+14 9.3920e+14 230981
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION +
## DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
## Final Model:
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
##
## Step Df Deviance Resid. Df Resid. Dev
## 1 9068 5.621702e+14
## 2 - BURG.FORCE.RES.NIGHT 0 0.0000 9068 5.621702e+14
## 3 - DRIVING.UNDER.THE.INFLUENCE 0 0.0000 9068 5.621702e+14
## 4 - POL.INFORMATION 0 0.0000 9068 5.621702e+14
## 5 - ASSAULT...BATTERY...CITIZEN 0 0.0000 9068 5.621702e+14
## 6 - LARCENY.PICK.POCKET 0 1.1875 9068 5.621702e+14
## 7 - VANDALISM.MOTOR.VEHICLE 0 18.0000 9068 5.621702e+14
## 8 - LOST.PROPERTY 0 13.1875 9068 5.621702e+14
## 9 - drug 0 4.8125 9068 5.621702e+14
## 10 - community_facilities_count 0 0.9375 9068 5.621702e+14
## 11 - mean_sales_num 0 0.3750 9068 5.621702e+14
## 12 - median_sales_num 0 0.5000 9068 5.621702e+14
## AIC
## 1 226376.6
## 2 226376.6
## 3 226376.6
## 4 226376.6
## 5 226376.6
## 6 226376.6
## 7 226376.6
## 8 226376.6
## 9 226376.6
## 10 226376.6
## 11 226376.6
## 12 226376.6
plot(mylogit)
#
#
#
#
#
#predict house price
devset$predicted_close_price4<-predict(mylogit, devset)
## Warning in predict.lm(mylogit, devset): prediction from a rank-deficient
## fit may be misleading
# how far off are the predicted prices
devset$difference4 <- devset$predicted_close_price4 - devset$Close.Price
devset$percent_error4 <- abs(devset$difference4/devset$Close.Price)
#Model 5
mylogit <- lm(Close.Price ~ median_sales_num + mean_sales_num + Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count + drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION + DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT, data = trainset)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION +
## DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION +
## DRIVING.UNDER.THE.INFLUENCE
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ median_sales_num + Bedrooms + Total.Square.Footage +
## Baths.All + Date.Quarter + Zip.Code
##
##
## Step: AIC=226376.6
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
## Df Sum of Sq RSS AIC
## <none> 5.6217e+14 226377
## - Date.Quarter 1 2.4396e+11 5.6241e+14 226379
## - Baths.All 1 1.5544e+13 5.7771e+14 226623
## - Bedrooms 1 3.3119e+13 5.9529e+14 226896
## - Total.Square.Footage 1 5.1436e+13 6.1361e+14 227172
## - Zip.Code 35 3.7703e+14 9.3920e+14 230981
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + LOST.PROPERTY + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET +
## Zip.Code + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION +
## DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
## Final Model:
## Close.Price ~ Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter +
## Zip.Code
##
##
## Step Df Deviance Resid. Df Resid. Dev
## 1 9068 5.621702e+14
## 2 - BURG.FORCE.RES.NIGHT 0 0.0000 9068 5.621702e+14
## 3 - DRIVING.UNDER.THE.INFLUENCE 0 0.0000 9068 5.621702e+14
## 4 - POL.INFORMATION 0 0.0000 9068 5.621702e+14
## 5 - ASSAULT...BATTERY...CITIZEN 0 0.0000 9068 5.621702e+14
## 6 - LARCENY.PICK.POCKET 0 1.1875 9068 5.621702e+14
## 7 - VANDALISM.MOTOR.VEHICLE 0 18.0000 9068 5.621702e+14
## 8 - LOST.PROPERTY 0 13.1875 9068 5.621702e+14
## 9 - drug 0 4.8125 9068 5.621702e+14
## 10 - community_facilities_count 0 0.9375 9068 5.621702e+14
## 11 - mean_sales_num 0 0.3750 9068 5.621702e+14
## 12 - median_sales_num 0 0.5000 9068 5.621702e+14
## AIC
## 1 226376.6
## 2 226376.6
## 3 226376.6
## 4 226376.6
## 5 226376.6
## 6 226376.6
## 7 226376.6
## 8 226376.6
## 9 226376.6
## 10 226376.6
## 11 226376.6
## 12 226376.6
plot(mylogit)
#
#
#
#
#
#predict housing price
devset$predicted_close_price5<-predict(mylogit, devset)
## Warning in predict.lm(mylogit, devset): prediction from a rank-deficient
## fit may be misleading
# how far off are the predicted prices
devset$difference5 <- devset$predicted_close_price5 - devset$Close.Price
devset$percent_error5 <- abs(devset$difference5/devset$Close.Price)
#Model 9 - Final Linear Model
#linear model with 3 factors and 4 identified in PCA, AIC 227765.1
#This comes here because some columns were removed for PCA, below.
names(trainset)
## [1] "Zip.Code"
## [2] "ROB.FIREARM...STREET"
## [3] "AGG.ASSLT.FIREARM.CITIZEN"
## [4] "BURG.FORCE.RES.NIGHT"
## [5] "LARCENY.PICK.POCKET"
## [6] "AUTO.THEFT...PASSENGER.VEHICLE"
## [7] "ASSAULT...BATTERY...CITIZEN"
## [8] "VANDALISM.MOTOR.VEHICLE"
## [9] "WEAPON.POSSESSION.HANDGUN"
## [10] "SEX.OFFENSE...SEX..ASSAULT"
## [11] "drug"
## [12] "FAMILY.OFFENSE...ABUSE.CHILD"
## [13] "JUVENILE.RUNAWAY"
## [14] "LIQUOR...UNLAWFUL.POSS.UNDER.21"
## [15] "DISORDERLY.CONDUCT"
## [16] "SUICIDE...POISON.OVERDOSE"
## [17] "LITTERING.TRASH.DUMPING"
## [18] "TRESPASSING"
## [19] "HARASSMENT.STALKING"
## [20] "DRIVING.UNDER.THE.INFLUENCE"
## [21] "FIRE.OTHER"
## [22] "POL.INFORMATION"
## [23] "LOST.PROPERTY"
## [24] "RECOVERED.PROPERTY.MONT..CO."
## [25] "community_facilities_count"
## [26] "Number_of_Sales_2014"
## [27] "Number_of_Crimes_2014"
## [28] "IRS_Estimated_Population_2014"
## [29] "Total_Number_of_Sales_State_Planning"
## [30] "ML."
## [31] "City"
## [32] "List.Price"
## [33] "Original.List.Price"
## [34] "Close.Price"
## [35] "Legal.Subdivision"
## [36] "Status"
## [37] "Date.Quarter"
## [38] "Close.Date"
## [39] "DOMM"
## [40] "DOMP"
## [41] "Baths.All"
## [42] "Bedrooms"
## [43] "Total.Square.Footage"
## [44] "Type.y"
## [45] "median_sales_num"
## [46] "mean_sales_num"
mylogit <- lm(
as.formula(paste(colnames(trainset)[34], "~",
paste(colnames(trainset)[c(2, 17, 4, 3, 41:43)], collapse = "+"),
sep = ""
)),
data=trainset
)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=229396.2
## Close.Price ~ ROB.FIREARM...STREET + LITTERING.TRASH.DUMPING +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN + Baths.All +
## Bedrooms + Total.Square.Footage
##
## Df Sum of Sq RSS AIC
## <none> 7.8868e+14 229396
## - BURG.FORCE.RES.NIGHT 1 1.1865e+12 7.8987e+14 229408
## - LITTERING.TRASH.DUMPING 1 3.2671e+12 7.9195e+14 229432
## - ROB.FIREARM...STREET 1 1.0681e+13 7.9936e+14 229517
## - Baths.All 1 2.2729e+13 8.1141e+14 229653
## - Bedrooms 1 4.0714e+13 8.2939e+14 229853
## - Total.Square.Footage 1 5.3457e+13 8.4214e+14 229991
## - AGG.ASSLT.FIREARM.CITIZEN 1 1.1452e+14 9.0319e+14 230629
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ ROB.FIREARM...STREET + LITTERING.TRASH.DUMPING +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN + Baths.All +
## Bedrooms + Total.Square.Footage
##
## Final Model:
## Close.Price ~ ROB.FIREARM...STREET + LITTERING.TRASH.DUMPING +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN + Baths.All +
## Bedrooms + Total.Square.Footage
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 9100 7.886793e+14 229396.2
plot(mylogit)
#
#
#
#
#
#predict housing price
devset$predicted_close_price9<-predict(mylogit, devset)
# how far off are the predicted prices
devset$difference9 <- devset$predicted_close_price9 - devset$Close.Price
devset$percent_error9 <- abs(devset$difference9/devset$Close.Price)
#linear model 6
mylogit <- lm(Close.Price ~ median_sales_num + mean_sales_num + Bedrooms + Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count + drug + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + ASSAULT...BATTERY...CITIZEN + POL.INFORMATION + DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT, data = trainset)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=226508.6
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + ASSAULT...BATTERY...CITIZEN +
## POL.INFORMATION + DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
## Df Sum of Sq RSS AIC
## - community_facilities_count 1 2.3350e+10 5.7354e+14 226507
## - POL.INFORMATION 1 4.6757e+10 5.7356e+14 226507
## - median_sales_num 1 1.2125e+11 5.7364e+14 226509
## - BURG.FORCE.RES.NIGHT 1 1.2138e+11 5.7364e+14 226509
## <none> 5.7352e+14 226509
## - drug 1 1.4201e+11 5.7366e+14 226509
## - Date.Quarter 1 2.9637e+11 5.7381e+14 226511
## - ASSAULT...BATTERY...CITIZEN 1 3.6121e+11 5.7388e+14 226512
## - VANDALISM.MOTOR.VEHICLE 1 9.9670e+11 5.7451e+14 226522
## - DRIVING.UNDER.THE.INFLUENCE 1 1.3390e+12 5.7486e+14 226528
## - LARCENY.PICK.POCKET 1 3.8860e+12 5.7740e+14 226568
## - mean_sales_num 1 7.6426e+12 5.8116e+14 226627
## - Baths.All 1 1.5000e+13 5.8852e+14 226742
## - Bedrooms 1 3.5553e+13 6.0907e+14 227054
## - Total.Square.Footage 1 4.8804e+13 6.2232e+14 227250
##
## Step: AIC=226507
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + drug +
## VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + ASSAULT...BATTERY...CITIZEN +
## POL.INFORMATION + DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
## Df Sum of Sq RSS AIC
## - POL.INFORMATION 1 4.5254e+10 5.7359e+14 226506
## - BURG.FORCE.RES.NIGHT 1 1.0051e+11 5.7364e+14 226507
## <none> 5.7354e+14 226507
## - drug 1 1.3477e+11 5.7368e+14 226507
## - median_sales_num 1 1.4371e+11 5.7369e+14 226507
## + community_facilities_count 1 2.3350e+10 5.7352e+14 226509
## - Date.Quarter 1 2.9714e+11 5.7384e+14 226510
## - ASSAULT...BATTERY...CITIZEN 1 3.3803e+11 5.7388e+14 226510
## - VANDALISM.MOTOR.VEHICLE 1 9.7958e+11 5.7452e+14 226521
## - DRIVING.UNDER.THE.INFLUENCE 1 1.3613e+12 5.7490e+14 226527
## - LARCENY.PICK.POCKET 1 3.9003e+12 5.7744e+14 226567
## - mean_sales_num 1 8.1235e+12 5.8166e+14 226633
## - Baths.All 1 1.4977e+13 5.8852e+14 226740
## - Bedrooms 1 3.5645e+13 6.0919e+14 227054
## - Total.Square.Footage 1 4.8884e+13 6.2243e+14 227250
##
## Step: AIC=226505.7
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + drug +
## VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + ASSAULT...BATTERY...CITIZEN +
## DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
## Df Sum of Sq RSS AIC
## - BURG.FORCE.RES.NIGHT 1 8.7713e+10 5.7367e+14 226505
## <none> 5.7359e+14 226506
## + POL.INFORMATION 1 4.5254e+10 5.7354e+14 226507
## + community_facilities_count 1 2.1847e+10 5.7356e+14 226507
## - drug 1 2.7132e+11 5.7386e+14 226508
## - Date.Quarter 1 2.9795e+11 5.7388e+14 226508
## - ASSAULT...BATTERY...CITIZEN 1 3.1027e+11 5.7390e+14 226509
## - median_sales_num 1 3.3385e+11 5.7392e+14 226509
## - VANDALISM.MOTOR.VEHICLE 1 9.8713e+11 5.7457e+14 226519
## - DRIVING.UNDER.THE.INFLUENCE 1 1.3716e+12 5.7496e+14 226525
## - LARCENY.PICK.POCKET 1 3.8629e+12 5.7745e+14 226565
## - mean_sales_num 1 1.1254e+13 5.8484e+14 226681
## - Baths.All 1 1.4935e+13 5.8852e+14 226738
## - Bedrooms 1 3.5943e+13 6.0953e+14 227057
## - Total.Square.Footage 1 4.8889e+13 6.2248e+14 227249
##
## Step: AIC=226505.1
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + drug +
## VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + ASSAULT...BATTERY...CITIZEN +
## DRIVING.UNDER.THE.INFLUENCE
##
## Df Sum of Sq RSS AIC
## <none> 5.7367e+14 226505
## + BURG.FORCE.RES.NIGHT 1 8.7713e+10 5.7359e+14 226506
## + POL.INFORMATION 1 3.2460e+10 5.7364e+14 226507
## - ASSAULT...BATTERY...CITIZEN 1 2.4462e+11 5.7392e+14 226507
## + community_facilities_count 1 2.6581e+09 5.7367e+14 226507
## - median_sales_num 1 2.6993e+11 5.7394e+14 226507
## - Date.Quarter 1 2.9864e+11 5.7397e+14 226508
## - drug 1 3.5537e+11 5.7403e+14 226509
## - VANDALISM.MOTOR.VEHICLE 1 1.2795e+12 5.7495e+14 226523
## - DRIVING.UNDER.THE.INFLUENCE 1 1.3321e+12 5.7501e+14 226524
## - LARCENY.PICK.POCKET 1 4.2399e+12 5.7791e+14 226570
## - mean_sales_num 1 1.1313e+13 5.8499e+14 226681
## - Baths.All 1 1.4858e+13 5.8853e+14 226736
## - Bedrooms 1 3.6707e+13 6.1038e+14 227068
## - Total.Square.Footage 1 4.8813e+13 6.2249e+14 227247
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + community_facilities_count +
## drug + VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + ASSAULT...BATTERY...CITIZEN +
## POL.INFORMATION + DRIVING.UNDER.THE.INFLUENCE + BURG.FORCE.RES.NIGHT
##
## Final Model:
## Close.Price ~ median_sales_num + mean_sales_num + Bedrooms +
## Total.Square.Footage + Baths.All + Date.Quarter + drug +
## VANDALISM.MOTOR.VEHICLE + LARCENY.PICK.POCKET + ASSAULT...BATTERY...CITIZEN +
## DRIVING.UNDER.THE.INFLUENCE
##
##
## Step Df Deviance Resid. Df Resid. Dev
## 1 9093 5.735180e+14
## 2 - community_facilities_count 1 23350406474 9094 5.735414e+14
## 3 - POL.INFORMATION 1 45254318174 9095 5.735866e+14
## 4 - BURG.FORCE.RES.NIGHT 1 87712518684 9096 5.736744e+14
## AIC
## 1 226508.6
## 2 226507.0
## 3 226505.7
## 4 226505.1
plot(mylogit)
#
#
#
#
#
#predict house price
devset$predicted_close_price6<-predict(mylogit, devset)
# how far off are the predicted prices
devset$difference6 <- devset$predicted_close_price6 - devset$Close.Price
devset$percent_error6 <- abs(devset$difference6/devset$Close.Price)
#linear model with 3 factors
#PCA
#In order to do PCA, factor columns need to be removed, but they are needed for the lm, above
#rattle was used here to identify the factor columns, quickly
#it is commented out for the RMD
names(full)
## [1] "Zip.Code"
## [2] "ROB.FIREARM...STREET"
## [3] "AGG.ASSLT.FIREARM.CITIZEN"
## [4] "BURG.FORCE.RES.NIGHT"
## [5] "LARCENY.PICK.POCKET"
## [6] "AUTO.THEFT...PASSENGER.VEHICLE"
## [7] "ASSAULT...BATTERY...CITIZEN"
## [8] "VANDALISM.MOTOR.VEHICLE"
## [9] "WEAPON.POSSESSION.HANDGUN"
## [10] "SEX.OFFENSE...SEX..ASSAULT"
## [11] "drug"
## [12] "FAMILY.OFFENSE...ABUSE.CHILD"
## [13] "JUVENILE.RUNAWAY"
## [14] "LIQUOR...UNLAWFUL.POSS.UNDER.21"
## [15] "DISORDERLY.CONDUCT"
## [16] "SUICIDE...POISON.OVERDOSE"
## [17] "LITTERING.TRASH.DUMPING"
## [18] "TRESPASSING"
## [19] "HARASSMENT.STALKING"
## [20] "DRIVING.UNDER.THE.INFLUENCE"
## [21] "FIRE.OTHER"
## [22] "POL.INFORMATION"
## [23] "LOST.PROPERTY"
## [24] "RECOVERED.PROPERTY.MONT..CO."
## [25] "community_facilities_count"
## [26] "Number_of_Sales_2014"
## [27] "Number_of_Crimes_2014"
## [28] "IRS_Estimated_Population_2014"
## [29] "Total_Number_of_Sales_State_Planning"
## [30] "ML."
## [31] "City"
## [32] "List.Price"
## [33] "Original.List.Price"
## [34] "Close.Price"
## [35] "Legal.Subdivision"
## [36] "Status"
## [37] "Date.Quarter"
## [38] "Close.Date"
## [39] "DOMM"
## [40] "DOMP"
## [41] "Baths.All"
## [42] "Bedrooms"
## [43] "Total.Square.Footage"
## [44] "Type.y"
## [45] "median_sales_num"
## [46] "mean_sales_num"
#remove columns that are factors (because something isn't working in PCA)
full1 <- full [, -c(1, 30, 31,35,36,38,44)]
#split dataset to test, train
index <- 1:nrow(full1)
testindex <- sample(index, trunc(length(index)/5))
testset <- full1[testindex,]
trainset <- full1[-testindex,]
#split for development set
index <- 1:nrow(trainset)
devsetindex <- sample(index, trunc(length(index)/5))
devset2 <- full1[devsetindex,]
trainset <- full1[-devsetindex,]
#PCA
#http://www.statmethods.net/advstats/factor.html
#run the Principle Component Analysis
fit <- princomp(trainset, cor=TRUE)
summary(fit) # print variance accounted for
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 3.6347734 2.2972031 1.92555828 1.51413278
## Proportion of Variance 0.3387584 0.1353113 0.09507115 0.05878457
## Cumulative Proportion 0.3387584 0.4740697 0.56914089 0.62792545
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 1.40646368 1.29780863 1.10056719 1.04303378
## Proportion of Variance 0.05072154 0.04318737 0.03105764 0.02789537
## Cumulative Proportion 0.67864699 0.72183436 0.75289200 0.78078737
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 1.00355642 0.98796539 0.94429259 0.91201272
## Proportion of Variance 0.02582373 0.02502758 0.02286381 0.02132736
## Cumulative Proportion 0.80661110 0.83163868 0.85450249 0.87582986
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 0.87484682 0.80456184 0.79970691 0.66871176
## Proportion of Variance 0.01962454 0.01659794 0.01639823 0.01146604
## Cumulative Proportion 0.89545439 0.91205234 0.92845057 0.93991661
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.591124046 0.545543644 0.518745719 0.468109715
## Proportion of Variance 0.008959683 0.007631227 0.006899926 0.005618633
## Cumulative Proportion 0.948876290 0.956507517 0.963407443 0.969026077
## Comp.21 Comp.22 Comp.23 Comp.24
## Standard deviation 0.444012954 0.436325690 0.421277196 0.376892053
## Proportion of Variance 0.005055064 0.004881541 0.004550628 0.003642247
## Cumulative Proportion 0.974081141 0.978962682 0.983513310 0.987155556
## Comp.25 Comp.26 Comp.27 Comp.28
## Standard deviation 0.336366492 0.31558132 0.289889123 0.248268529
## Proportion of Variance 0.002901088 0.00255363 0.002154762 0.001580443
## Cumulative Proportion 0.990056644 0.99261027 0.994765036 0.996345478
## Comp.29 Comp.30 Comp.31 Comp.32
## Standard deviation 0.1972720300 0.1849695961 0.1462771888 0.1145508830
## Proportion of Variance 0.0009978527 0.0008772757 0.0005486414 0.0003364591
## Cumulative Proportion 0.9973433310 0.9982206066 0.9987692481 0.9991057072
## Comp.33 Comp.34 Comp.35 Comp.36
## Standard deviation 0.1021310436 0.0907729681 0.0818125483 0.0652921575
## Proportion of Variance 0.0002674551 0.0002112752 0.0001716229 0.0001093094
## Cumulative Proportion 0.9993731623 0.9995844375 0.9997560604 0.9998653698
## Comp.37 Comp.38 Comp.39
## Standard deviation 0.0552646682 4.248429e-02 1.978587e-02
## Proportion of Variance 0.0000783124 4.627988e-05 1.003797e-05
## Cumulative Proportion 0.9999436822 9.999900e-01 1.000000e+00
loadings(fit) # pc loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## ROB.FIREARM...STREET -0.234 0.107 0.150
## AGG.ASSLT.FIREARM.CITIZEN -0.242 0.158 0.157
## BURG.FORCE.RES.NIGHT -0.181 -0.207 -0.134
## LARCENY.PICK.POCKET -0.187 -0.237 -0.127
## AUTO.THEFT...PASSENGER.VEHICLE -0.241
## ASSAULT...BATTERY...CITIZEN -0.260 -0.102
## VANDALISM.MOTOR.VEHICLE -0.211 -0.217
## WEAPON.POSSESSION.HANDGUN -0.200 -0.130 0.140
## SEX.OFFENSE...SEX..ASSAULT -0.157 -0.355
## drug -0.238
## FAMILY.OFFENSE...ABUSE.CHILD -0.176 0.135 -0.118 0.233 0.167
## JUVENILE.RUNAWAY -0.171 0.181 0.143 -0.255
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.215 -0.103 0.265
## DISORDERLY.CONDUCT -0.208 -0.183 0.169
## SUICIDE...POISON.OVERDOSE -0.131 -0.344
## LITTERING.TRASH.DUMPING -0.188 -0.131 0.234
## TRESPASSING -0.213 -0.144 0.234
## HARASSMENT.STALKING -0.271 0.117 -0.111 0.208
## DRIVING.UNDER.THE.INFLUENCE -0.153 -0.149 -0.126
## FIRE.OTHER -0.215
## POL.INFORMATION -0.144 -0.121 0.120 -0.118
## LOST.PROPERTY -0.113 -0.172 0.139 -0.240 -0.327
## RECOVERED.PROPERTY.MONT..CO. -0.159 -0.178 -0.138 -0.305
## community_facilities_count 0.168 0.333
## Number_of_Sales_2014 0.193 0.436
## Number_of_Crimes_2014 -0.216 0.287
## IRS_Estimated_Population_2014 0.219 0.405
## Total_Number_of_Sales_State_Planning 0.258 0.393
## List.Price 0.118 -0.270 0.188 0.251
## Original.List.Price 0.118 -0.270 0.188 0.261
## Close.Price 0.118 -0.272 0.185 0.243
## Date.Quarter
## DOMM 0.290 -0.131
## DOMP 0.309 -0.131
## Baths.All 0.336
## Bedrooms -0.112 0.359
## Total.Square.Footage
## median_sales_num 0.162 -0.271 0.140 -0.187
## mean_sales_num 0.171 -0.258 0.140 -0.178
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## ROB.FIREARM...STREET -0.257 -0.143
## AGG.ASSLT.FIREARM.CITIZEN
## BURG.FORCE.RES.NIGHT -0.128 -0.103
## LARCENY.PICK.POCKET
## AUTO.THEFT...PASSENGER.VEHICLE -0.236 -0.110
## ASSAULT...BATTERY...CITIZEN
## VANDALISM.MOTOR.VEHICLE -0.260
## WEAPON.POSSESSION.HANDGUN
## SEX.OFFENSE...SEX..ASSAULT 0.202
## drug 0.151
## FAMILY.OFFENSE...ABUSE.CHILD 0.135 0.203
## JUVENILE.RUNAWAY -0.121 -0.294 -0.135
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.161 0.161
## DISORDERLY.CONDUCT 0.158
## SUICIDE...POISON.OVERDOSE 0.472 0.207
## LITTERING.TRASH.DUMPING
## TRESPASSING 0.112
## HARASSMENT.STALKING
## DRIVING.UNDER.THE.INFLUENCE 0.216 0.264
## FIRE.OTHER -0.211 -0.194
## POL.INFORMATION 0.392 -0.239 -0.100 -0.119
## LOST.PROPERTY 0.224 -0.242
## RECOVERED.PROPERTY.MONT..CO. -0.140
## community_facilities_count 0.285 0.138 0.105
## Number_of_Sales_2014
## Number_of_Crimes_2014
## IRS_Estimated_Population_2014
## Total_Number_of_Sales_State_Planning
## List.Price -0.124 -0.121 0.138
## Original.List.Price -0.121 0.134
## Close.Price -0.138 -0.120 0.137
## Date.Quarter -0.353 0.469 0.797
## DOMM 0.616
## DOMP 0.600
## Baths.All -0.245 0.326 -0.190
## Bedrooms -0.273 0.318 -0.162
## Total.Square.Footage 0.124 -0.815 0.537
## median_sales_num
## mean_sales_num -0.102
## Comp.11 Comp.12 Comp.13 Comp.14
## ROB.FIREARM...STREET 0.165
## AGG.ASSLT.FIREARM.CITIZEN 0.105 -0.107
## BURG.FORCE.RES.NIGHT -0.144 0.209 -0.147
## LARCENY.PICK.POCKET -0.149 0.129
## AUTO.THEFT...PASSENGER.VEHICLE 0.250
## ASSAULT...BATTERY...CITIZEN
## VANDALISM.MOTOR.VEHICLE -0.165 0.320
## WEAPON.POSSESSION.HANDGUN 0.385 -0.179 -0.145
## SEX.OFFENSE...SEX..ASSAULT 0.126 0.412 0.284
## drug 0.145 0.105 0.179
## FAMILY.OFFENSE...ABUSE.CHILD 0.225
## JUVENILE.RUNAWAY -0.115
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.111 -0.122
## DISORDERLY.CONDUCT -0.199 -0.181
## SUICIDE...POISON.OVERDOSE 0.209 -0.215 -0.418 0.113
## LITTERING.TRASH.DUMPING 0.133 -0.173 0.279 -0.385
## TRESPASSING -0.105
## HARASSMENT.STALKING 0.300 -0.202 0.441
## DRIVING.UNDER.THE.INFLUENCE -0.558 -0.140
## FIRE.OTHER -0.261
## POL.INFORMATION 0.209 0.301 0.146
## LOST.PROPERTY 0.120 -0.175
## RECOVERED.PROPERTY.MONT..CO. 0.215 -0.125 -0.218 -0.320
## community_facilities_count 0.156 -0.296 0.391 -0.144
## Number_of_Sales_2014 -0.211
## Number_of_Crimes_2014
## IRS_Estimated_Population_2014
## Total_Number_of_Sales_State_Planning -0.211
## List.Price 0.298
## Original.List.Price 0.285
## Close.Price 0.298
## Date.Quarter
## DOMM -0.117
## DOMP
## Baths.All -0.159 -0.325 -0.139 0.127
## Bedrooms -0.125 -0.344
## Total.Square.Footage 0.137
## median_sales_num -0.148
## mean_sales_num -0.178
## Comp.15 Comp.16 Comp.17 Comp.18
## ROB.FIREARM...STREET -0.102
## AGG.ASSLT.FIREARM.CITIZEN
## BURG.FORCE.RES.NIGHT 0.487
## LARCENY.PICK.POCKET 0.121 -0.187 -0.335
## AUTO.THEFT...PASSENGER.VEHICLE 0.187 -0.209
## ASSAULT...BATTERY...CITIZEN -0.127 0.121
## VANDALISM.MOTOR.VEHICLE -0.187 0.189
## WEAPON.POSSESSION.HANDGUN 0.364 -0.208
## SEX.OFFENSE...SEX..ASSAULT -0.286 0.176 -0.289
## drug -0.195 0.255
## FAMILY.OFFENSE...ABUSE.CHILD 0.230 -0.145
## JUVENILE.RUNAWAY -0.183
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.250 -0.169
## DISORDERLY.CONDUCT
## SUICIDE...POISON.OVERDOSE -0.230 0.105 -0.138
## LITTERING.TRASH.DUMPING -0.134 -0.101
## TRESPASSING -0.284 -0.350 0.124
## HARASSMENT.STALKING 0.408 0.424
## DRIVING.UNDER.THE.INFLUENCE 0.311 0.247 0.131
## FIRE.OTHER 0.441 -0.275
## POL.INFORMATION -0.200 0.427
## LOST.PROPERTY 0.119 -0.213
## RECOVERED.PROPERTY.MONT..CO. 0.308
## community_facilities_count 0.103
## Number_of_Sales_2014
## Number_of_Crimes_2014 -0.155
## IRS_Estimated_Population_2014 0.152
## Total_Number_of_Sales_State_Planning
## List.Price
## Original.List.Price
## Close.Price
## Date.Quarter
## DOMM
## DOMP
## Baths.All -0.642
## Bedrooms 0.670
## Total.Square.Footage
## median_sales_num -0.117 -0.241
## mean_sales_num -0.221
## Comp.19 Comp.20 Comp.21 Comp.22
## ROB.FIREARM...STREET -0.107 -0.111
## AGG.ASSLT.FIREARM.CITIZEN -0.158
## BURG.FORCE.RES.NIGHT 0.276 -0.192 0.115
## LARCENY.PICK.POCKET -0.227 0.199 0.103
## AUTO.THEFT...PASSENGER.VEHICLE 0.141
## ASSAULT...BATTERY...CITIZEN -0.205
## VANDALISM.MOTOR.VEHICLE -0.103 -0.125
## WEAPON.POSSESSION.HANDGUN -0.106 -0.253
## SEX.OFFENSE...SEX..ASSAULT 0.368 -0.203 0.104
## drug -0.537 -0.128 -0.214
## FAMILY.OFFENSE...ABUSE.CHILD 0.266 0.153 -0.395
## JUVENILE.RUNAWAY 0.326
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.217
## DISORDERLY.CONDUCT 0.213 0.201 -0.119
## SUICIDE...POISON.OVERDOSE -0.187 0.188
## LITTERING.TRASH.DUMPING -0.150 -0.405 0.390
## TRESPASSING
## HARASSMENT.STALKING 0.165
## DRIVING.UNDER.THE.INFLUENCE -0.136
## FIRE.OTHER
## POL.INFORMATION 0.252 -0.149 0.265
## LOST.PROPERTY
## RECOVERED.PROPERTY.MONT..CO. 0.193 0.192 -0.124
## community_facilities_count 0.298 -0.147 0.112
## Number_of_Sales_2014 -0.230
## Number_of_Crimes_2014 0.142
## IRS_Estimated_Population_2014 -0.119 -0.158
## Total_Number_of_Sales_State_Planning -0.143 0.131
## List.Price
## Original.List.Price
## Close.Price
## Date.Quarter
## DOMM 0.153 0.638 0.190
## DOMP -0.153 -0.651 -0.201
## Baths.All 0.235
## Bedrooms -0.232
## Total.Square.Footage
## median_sales_num -0.113 -0.324
## mean_sales_num -0.146 -0.357
## Comp.23 Comp.24 Comp.25 Comp.26
## ROB.FIREARM...STREET 0.131 -0.194 0.141 -0.183
## AGG.ASSLT.FIREARM.CITIZEN -0.165 0.132 0.298 0.166
## BURG.FORCE.RES.NIGHT 0.177 0.193
## LARCENY.PICK.POCKET 0.227 0.409 0.380
## AUTO.THEFT...PASSENGER.VEHICLE -0.174 -0.268
## ASSAULT...BATTERY...CITIZEN 0.143 -0.129 0.394
## VANDALISM.MOTOR.VEHICLE 0.545 -0.156
## WEAPON.POSSESSION.HANDGUN -0.356 0.227
## SEX.OFFENSE...SEX..ASSAULT -0.162 -0.147 -0.107
## drug -0.192 -0.131
## FAMILY.OFFENSE...ABUSE.CHILD 0.478 -0.196
## JUVENILE.RUNAWAY -0.103 -0.362 -0.249 0.198
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.257
## DISORDERLY.CONDUCT -0.297 0.221 -0.357
## SUICIDE...POISON.OVERDOSE -0.125
## LITTERING.TRASH.DUMPING 0.174 -0.104 -0.301
## TRESPASSING 0.134 -0.227
## HARASSMENT.STALKING -0.160 -0.141 0.215
## DRIVING.UNDER.THE.INFLUENCE -0.271 -0.225
## FIRE.OTHER 0.105 0.154 -0.177 -0.182
## POL.INFORMATION -0.106
## LOST.PROPERTY 0.353 0.296
## RECOVERED.PROPERTY.MONT..CO. -0.179 0.157 -0.249
## community_facilities_count 0.194 0.187 -0.321
## Number_of_Sales_2014 0.150
## Number_of_Crimes_2014 -0.189 -0.137 0.237
## IRS_Estimated_Population_2014 -0.102 -0.143
## Total_Number_of_Sales_State_Planning 0.148 -0.131
## List.Price
## Original.List.Price
## Close.Price
## Date.Quarter
## DOMM -0.122
## DOMP 0.102
## Baths.All
## Bedrooms
## Total.Square.Footage
## median_sales_num -0.221
## mean_sales_num -0.108
## Comp.27 Comp.28 Comp.29 Comp.30
## ROB.FIREARM...STREET 0.198 0.212 0.256 -0.495
## AGG.ASSLT.FIREARM.CITIZEN 0.113 -0.593 -0.190 0.174
## BURG.FORCE.RES.NIGHT 0.148 0.250 0.347
## LARCENY.PICK.POCKET 0.129 -0.275
## AUTO.THEFT...PASSENGER.VEHICLE -0.343 0.213 0.154
## ASSAULT...BATTERY...CITIZEN 0.247 -0.250
## VANDALISM.MOTOR.VEHICLE 0.210 -0.331
## WEAPON.POSSESSION.HANDGUN 0.217 0.272
## SEX.OFFENSE...SEX..ASSAULT -0.100
## drug -0.200 0.115 0.230
## FAMILY.OFFENSE...ABUSE.CHILD 0.142 -0.227
## JUVENILE.RUNAWAY 0.403 -0.200 0.126
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.115 -0.256
## DISORDERLY.CONDUCT 0.216 0.302 0.187
## SUICIDE...POISON.OVERDOSE -0.163 0.141
## LITTERING.TRASH.DUMPING -0.222
## TRESPASSING 0.166 0.407
## HARASSMENT.STALKING -0.104
## DRIVING.UNDER.THE.INFLUENCE 0.138 -0.103 -0.102
## FIRE.OTHER -0.143 -0.358
## POL.INFORMATION 0.187 -0.153
## LOST.PROPERTY -0.117 0.332
## RECOVERED.PROPERTY.MONT..CO. -0.211 0.109 -0.411
## community_facilities_count 0.105 -0.120 0.236
## Number_of_Sales_2014 0.119
## Number_of_Crimes_2014 -0.403 0.128
## IRS_Estimated_Population_2014 -0.157 -0.209
## Total_Number_of_Sales_State_Planning 0.255 0.177 -0.117 0.199
## List.Price
## Original.List.Price
## Close.Price
## Date.Quarter
## DOMM
## DOMP
## Baths.All
## Bedrooms
## Total.Square.Footage
## median_sales_num
## mean_sales_num 0.230 -0.134 0.124
## Comp.31 Comp.32 Comp.33 Comp.34
## ROB.FIREARM...STREET 0.412 -0.157
## AGG.ASSLT.FIREARM.CITIZEN 0.360 -0.191
## BURG.FORCE.RES.NIGHT 0.170 -0.126
## LARCENY.PICK.POCKET
## AUTO.THEFT...PASSENGER.VEHICLE -0.154 -0.408 0.251
## ASSAULT...BATTERY...CITIZEN 0.302 0.201
## VANDALISM.MOTOR.VEHICLE -0.258
## WEAPON.POSSESSION.HANDGUN -0.219 -0.111
## SEX.OFFENSE...SEX..ASSAULT 0.158 0.102
## drug 0.236
## FAMILY.OFFENSE...ABUSE.CHILD -0.128
## JUVENILE.RUNAWAY -0.239
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.458 0.231
## DISORDERLY.CONDUCT 0.286 -0.228 0.121
## SUICIDE...POISON.OVERDOSE
## LITTERING.TRASH.DUMPING -0.111 0.103
## TRESPASSING 0.260 -0.417
## HARASSMENT.STALKING
## DRIVING.UNDER.THE.INFLUENCE
## FIRE.OTHER 0.411
## POL.INFORMATION 0.164
## LOST.PROPERTY -0.371
## RECOVERED.PROPERTY.MONT..CO.
## community_facilities_count 0.190
## Number_of_Sales_2014 -0.224
## Number_of_Crimes_2014 -0.185
## IRS_Estimated_Population_2014 -0.266 -0.414
## Total_Number_of_Sales_State_Planning 0.195 0.287
## List.Price -0.115
## Original.List.Price -0.100 0.750
## Close.Price -0.629
## Date.Quarter
## DOMM
## DOMP
## Baths.All
## Bedrooms
## Total.Square.Footage
## median_sales_num 0.171 0.362
## mean_sales_num -0.139 -0.106 -0.196
## Comp.35 Comp.36 Comp.37 Comp.38
## ROB.FIREARM...STREET
## AGG.ASSLT.FIREARM.CITIZEN -0.180
## BURG.FORCE.RES.NIGHT -0.182 0.113 0.119
## LARCENY.PICK.POCKET -0.112
## AUTO.THEFT...PASSENGER.VEHICLE 0.270
## ASSAULT...BATTERY...CITIZEN 0.361 -0.314
## VANDALISM.MOTOR.VEHICLE 0.118
## WEAPON.POSSESSION.HANDGUN 0.180 -0.115
## SEX.OFFENSE...SEX..ASSAULT
## drug -0.275 0.258
## FAMILY.OFFENSE...ABUSE.CHILD
## JUVENILE.RUNAWAY
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.192 0.188 0.269
## DISORDERLY.CONDUCT -0.179 -0.160
## SUICIDE...POISON.OVERDOSE -0.127
## LITTERING.TRASH.DUMPING
## TRESPASSING 0.126 0.119 -0.200
## HARASSMENT.STALKING
## DRIVING.UNDER.THE.INFLUENCE 0.194 -0.208
## FIRE.OTHER -0.126 0.210
## POL.INFORMATION
## LOST.PROPERTY
## RECOVERED.PROPERTY.MONT..CO.
## community_facilities_count
## Number_of_Sales_2014 -0.359 -0.527 -0.309
## Number_of_Crimes_2014 0.214 -0.188
## IRS_Estimated_Population_2014 -0.208 0.295 0.149
## Total_Number_of_Sales_State_Planning 0.398 0.344 0.261
## List.Price -0.802
## Original.List.Price 0.299
## Close.Price 0.509
## Date.Quarter
## DOMM
## DOMP
## Baths.All
## Bedrooms
## Total.Square.Footage
## median_sales_num 0.346 -0.474
## mean_sales_num 0.280 -0.357 0.419
## Comp.39
## ROB.FIREARM...STREET 0.187
## AGG.ASSLT.FIREARM.CITIZEN
## BURG.FORCE.RES.NIGHT 0.139
## LARCENY.PICK.POCKET -0.267
## AUTO.THEFT...PASSENGER.VEHICLE -0.202
## ASSAULT...BATTERY...CITIZEN -0.343
## VANDALISM.MOTOR.VEHICLE
## WEAPON.POSSESSION.HANDGUN
## SEX.OFFENSE...SEX..ASSAULT
## drug
## FAMILY.OFFENSE...ABUSE.CHILD
## JUVENILE.RUNAWAY
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.211
## DISORDERLY.CONDUCT
## SUICIDE...POISON.OVERDOSE
## LITTERING.TRASH.DUMPING
## TRESPASSING -0.172
## HARASSMENT.STALKING
## DRIVING.UNDER.THE.INFLUENCE
## FIRE.OTHER
## POL.INFORMATION
## LOST.PROPERTY 0.113
## RECOVERED.PROPERTY.MONT..CO.
## community_facilities_count
## Number_of_Sales_2014
## Number_of_Crimes_2014 0.620
## IRS_Estimated_Population_2014 -0.430
## Total_Number_of_Sales_State_Planning
## List.Price
## Original.List.Price
## Close.Price
## Date.Quarter
## DOMM
## DOMP
## Baths.All
## Bedrooms
## Total.Square.Footage
## median_sales_num -0.106
## mean_sales_num
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.026 0.026 0.026 0.026 0.026 0.026 0.026 0.026
## Cumulative Var 0.026 0.051 0.077 0.103 0.128 0.154 0.179 0.205
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.026 0.026 0.026 0.026 0.026 0.026 0.026
## Cumulative Var 0.231 0.256 0.282 0.308 0.333 0.359 0.385
## Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 Comp.21 Comp.22
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.026 0.026 0.026 0.026 0.026 0.026 0.026
## Cumulative Var 0.410 0.436 0.462 0.487 0.513 0.538 0.564
## Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.026 0.026 0.026 0.026 0.026 0.026 0.026
## Cumulative Var 0.590 0.615 0.641 0.667 0.692 0.718 0.744
## Comp.30 Comp.31 Comp.32 Comp.33 Comp.34 Comp.35 Comp.36
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.026 0.026 0.026 0.026 0.026 0.026 0.026
## Cumulative Var 0.769 0.795 0.821 0.846 0.872 0.897 0.923
## Comp.37 Comp.38 Comp.39
## SS loadings 1.000 1.000 1.000
## Proportion Var 0.026 0.026 0.026
## Cumulative Var 0.949 0.974 1.000
plot(fit,type="lines") # scree plot
#fit$scores # the principal components
#The biplot is messy because there are so many components.
#There is likely a better way to plot the biplot.
biplot(fit)
#which factors are most important? - this was useful
#This provides a ranking of the PCA by index
#The index can then be used to choose factors for the linear model
library(psych)
fit <- principal(trainset, nfactors=37, rotate="varimax")
fit # print results
## Principal Components Analysis
## Call: principal(r = trainset, nfactors = 37, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC12 RC3 RC2 RC5 RC6
## ROB.FIREARM...STREET 0.83 0.28 -0.01 -0.08 0.05 -0.02
## AGG.ASSLT.FIREARM.CITIZEN 0.73 0.53 0.18 -0.18 -0.05 -0.01
## BURG.FORCE.RES.NIGHT 0.52 0.10 -0.35 -0.01 0.30 -0.01
## LARCENY.PICK.POCKET 0.72 -0.11 -0.09 0.06 0.29 -0.03
## AUTO.THEFT...PASSENGER.VEHICLE 0.71 0.30 0.01 -0.10 0.08 -0.01
## ASSAULT...BATTERY...CITIZEN 0.67 0.49 0.10 -0.19 0.23 0.00
## VANDALISM.MOTOR.VEHICLE 0.54 0.22 0.01 -0.07 0.22 -0.03
## WEAPON.POSSESSION.HANDGUN 0.40 0.60 -0.10 -0.17 0.19 -0.02
## SEX.OFFENSE...SEX..ASSAULT 0.18 0.37 0.07 -0.14 0.22 -0.01
## drug 0.58 0.51 0.12 -0.18 0.09 -0.01
## FAMILY.OFFENSE...ABUSE.CHILD 0.42 0.65 -0.02 -0.22 -0.18 0.01
## JUVENILE.RUNAWAY 0.21 0.61 0.17 -0.20 0.19 -0.01
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.93 0.11 0.04 -0.08 0.02 -0.01
## DISORDERLY.CONDUCT 0.87 0.02 -0.06 -0.02 0.11 -0.02
## SUICIDE...POISON.OVERDOSE 0.07 0.05 -0.17 0.00 0.15 -0.02
## LITTERING.TRASH.DUMPING 0.77 0.10 -0.12 -0.03 0.10 0.00
## TRESPASSING 0.94 0.07 0.00 -0.05 0.16 0.00
## HARASSMENT.STALKING 0.32 -0.31 -0.08 0.19 0.09 -0.02
## DRIVING.UNDER.THE.INFLUENCE 0.38 0.10 -0.02 -0.08 0.19 -0.01
## FIRE.OTHER 0.67 0.26 0.13 -0.06 0.23 -0.01
## POL.INFORMATION 0.42 0.01 0.08 -0.03 0.30 0.01
## LOST.PROPERTY 0.23 -0.10 0.08 0.04 0.86 0.01
## RECOVERED.PROPERTY.MONT..CO. 0.41 0.04 -0.04 0.01 0.78 -0.02
## community_facilities_count 0.05 0.08 0.66 -0.01 -0.06 0.00
## Number_of_Sales_2014 -0.06 -0.06 0.98 -0.01 0.04 0.00
## Number_of_Crimes_2014 0.61 0.21 0.59 -0.10 0.21 -0.01
## IRS_Estimated_Population_2014 0.14 0.13 0.94 -0.07 0.02 0.01
## Total_Number_of_Sales_State_Planning -0.10 0.08 0.97 -0.06 -0.04 0.00
## List.Price -0.09 -0.23 -0.05 0.95 0.01 0.06
## Original.List.Price -0.08 -0.23 -0.05 0.95 0.01 0.10
## Close.Price -0.08 -0.24 -0.05 0.95 0.01 0.04
## Date.Quarter 0.00 0.01 0.00 -0.01 0.00 0.03
## DOMM -0.02 0.00 0.01 0.07 0.00 0.95
## DOMP -0.02 0.00 0.00 0.12 -0.01 0.94
## Baths.All -0.10 -0.05 -0.01 0.29 -0.01 0.04
## Bedrooms -0.07 -0.03 -0.08 0.29 -0.04 0.01
## Total.Square.Footage 0.00 -0.01 0.00 0.02 0.00 0.03
## median_sales_num -0.15 -0.89 -0.09 0.35 0.08 -0.01
## mean_sales_num -0.19 -0.88 -0.07 0.36 0.05 -0.01
## RC8 RC11 RC13 RC10 RC9 RC24
## ROB.FIREARM...STREET -0.03 -0.09 -0.01 0.00 0.00 0.27
## AGG.ASSLT.FIREARM.CITIZEN 0.03 -0.05 0.03 0.00 0.00 0.08
## BURG.FORCE.RES.NIGHT 0.09 0.14 0.08 0.00 0.00 0.23
## LARCENY.PICK.POCKET 0.11 0.28 0.16 0.00 -0.01 0.14
## AUTO.THEFT...PASSENGER.VEHICLE 0.07 0.00 0.18 0.00 0.00 0.34
## ASSAULT...BATTERY...CITIZEN 0.17 0.16 0.16 0.00 0.00 0.18
## VANDALISM.MOTOR.VEHICLE 0.19 0.11 0.21 0.00 0.00 0.69
## WEAPON.POSSESSION.HANDGUN 0.16 -0.05 0.10 0.01 -0.01 -0.06
## SEX.OFFENSE...SEX..ASSAULT 0.13 0.10 0.84 0.01 0.01 0.12
## drug -0.02 0.17 0.16 0.00 0.00 0.17
## FAMILY.OFFENSE...ABUSE.CHILD 0.09 -0.04 0.03 0.00 -0.01 -0.11
## JUVENILE.RUNAWAY 0.03 -0.05 0.14 0.00 0.00 0.26
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.01 0.16 0.05 0.00 0.01 -0.13
## DISORDERLY.CONDUCT 0.11 0.30 -0.03 -0.01 0.00 0.02
## SUICIDE...POISON.OVERDOSE 0.96 0.09 0.09 -0.01 -0.01 0.07
## LITTERING.TRASH.DUMPING -0.09 -0.06 -0.02 0.01 -0.01 -0.01
## TRESPASSING 0.02 0.02 0.04 0.00 0.00 -0.03
## HARASSMENT.STALKING 0.03 0.00 -0.03 0.00 -0.01 0.00
## DRIVING.UNDER.THE.INFLUENCE 0.11 0.88 0.09 0.00 0.01 0.05
## FIRE.OTHER -0.10 0.15 -0.08 0.01 -0.01 0.13
## POL.INFORMATION -0.03 0.06 0.10 0.01 0.00 0.00
## LOST.PROPERTY 0.03 0.19 0.20 0.01 0.00 0.06
## RECOVERED.PROPERTY.MONT..CO. 0.28 0.03 0.02 0.00 0.00 0.09
## community_facilities_count -0.01 0.03 0.11 0.00 0.00 -0.01
## Number_of_Sales_2014 -0.04 -0.01 -0.02 0.00 0.00 0.01
## Number_of_Crimes_2014 0.00 0.12 0.17 0.00 0.00 0.09
## IRS_Estimated_Population_2014 -0.11 0.01 0.04 0.00 0.00 0.03
## Total_Number_of_Sales_State_Planning -0.04 -0.04 -0.01 0.00 0.00 -0.03
## List.Price 0.00 -0.02 -0.04 -0.01 0.01 -0.02
## Original.List.Price 0.00 -0.02 -0.04 0.00 0.01 -0.01
## Close.Price 0.00 -0.02 -0.04 -0.01 0.01 -0.02
## Date.Quarter -0.01 0.00 0.00 1.00 -0.01 0.00
## DOMM -0.01 -0.01 0.00 0.01 0.02 -0.01
## DOMP -0.02 0.00 0.00 0.02 0.01 0.00
## Baths.All -0.03 -0.03 -0.02 0.00 0.01 -0.02
## Bedrooms -0.03 -0.04 -0.01 0.01 0.00 -0.02
## Total.Square.Footage -0.01 0.00 0.00 -0.01 1.00 0.00
## median_sales_num 0.00 -0.07 -0.12 -0.01 0.00 -0.07
## mean_sales_num 0.02 -0.06 -0.10 -0.01 0.00 -0.04
## RC14 RC17 RC7 RC4 RC20 RC16
## ROB.FIREARM...STREET 0.06 -0.03 0.01 -0.03 -0.11 0.11
## AGG.ASSLT.FIREARM.CITIZEN 0.02 -0.03 0.10 -0.01 0.02 0.10
## BURG.FORCE.RES.NIGHT 0.07 -0.04 0.10 0.00 0.02 0.61
## LARCENY.PICK.POCKET 0.16 -0.03 0.05 -0.06 0.09 0.04
## AUTO.THEFT...PASSENGER.VEHICLE 0.02 -0.04 0.12 -0.04 -0.03 0.28
## ASSAULT...BATTERY...CITIZEN -0.10 -0.02 0.02 -0.02 0.01 0.16
## VANDALISM.MOTOR.VEHICLE 0.00 -0.03 -0.01 -0.04 -0.01 0.14
## WEAPON.POSSESSION.HANDGUN 0.07 -0.02 0.04 0.00 0.02 0.07
## SEX.OFFENSE...SEX..ASSAULT -0.03 -0.02 0.10 -0.01 0.07 0.04
## drug 0.07 0.01 0.21 -0.03 0.05 -0.05
## FAMILY.OFFENSE...ABUSE.CHILD -0.01 -0.02 0.07 0.01 0.13 0.07
## JUVENILE.RUNAWAY -0.22 -0.01 -0.18 -0.03 0.04 0.10
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.01 -0.04 0.09 -0.01 0.06 -0.02
## DISORDERLY.CONDUCT 0.13 -0.03 0.08 -0.03 0.02 0.00
## SUICIDE...POISON.OVERDOSE 0.02 -0.02 -0.02 -0.03 0.00 0.02
## LITTERING.TRASH.DUMPING 0.02 -0.01 0.03 0.03 0.17 0.18
## TRESPASSING 0.09 -0.02 0.12 -0.02 -0.04 -0.13
## HARASSMENT.STALKING 0.86 0.00 0.07 0.00 0.05 0.03
## DRIVING.UNDER.THE.INFLUENCE -0.01 -0.04 0.05 -0.04 0.01 0.05
## FIRE.OTHER 0.14 -0.04 -0.03 -0.04 -0.15 0.09
## POL.INFORMATION 0.08 0.01 0.83 -0.01 -0.04 0.05
## LOST.PROPERTY 0.06 0.00 0.25 -0.03 -0.03 -0.04
## RECOVERED.PROPERTY.MONT..CO. 0.04 -0.02 0.04 -0.01 0.00 0.20
## community_facilities_count 0.09 -0.01 -0.07 0.02 0.71 0.01
## Number_of_Sales_2014 0.01 -0.01 0.04 -0.04 -0.05 -0.05
## Number_of_Crimes_2014 -0.02 -0.01 0.11 -0.03 0.19 0.07
## IRS_Estimated_Population_2014 -0.06 0.00 0.01 0.00 0.15 0.06
## Total_Number_of_Sales_State_Planning -0.05 0.00 0.02 -0.03 -0.03 -0.10
## List.Price 0.05 0.09 -0.01 0.09 0.00 -0.01
## Original.List.Price 0.05 0.09 -0.01 0.09 0.00 -0.01
## Close.Price 0.05 0.09 -0.01 0.09 -0.01 0.00
## Date.Quarter 0.00 0.00 0.00 0.01 0.00 0.00
## DOMM -0.01 0.01 0.00 0.01 0.00 0.00
## DOMP -0.01 0.03 0.00 0.01 0.00 0.00
## Baths.All 0.00 0.90 0.01 0.30 -0.01 -0.01
## Bedrooms 0.00 0.31 -0.01 0.89 0.01 0.00
## Total.Square.Footage 0.00 0.00 0.00 0.00 0.00 0.00
## median_sales_num 0.12 0.02 0.05 0.02 0.00 0.03
## mean_sales_num 0.17 0.02 -0.03 0.01 0.01 -0.01
## RC15 RC26 RC27 RC22 RC23 RC19
## ROB.FIREARM...STREET 0.05 0.12 0.17 0.00 0.06 0.06
## AGG.ASSLT.FIREARM.CITIZEN 0.13 0.02 0.01 -0.03 0.08 0.07
## BURG.FORCE.RES.NIGHT 0.05 0.05 0.06 0.12 0.05 -0.02
## LARCENY.PICK.POCKET 0.02 0.10 0.06 0.02 -0.02 0.02
## AUTO.THEFT...PASSENGER.VEHICLE 0.00 0.25 0.15 -0.02 -0.03 0.06
## ASSAULT...BATTERY...CITIZEN 0.02 -0.04 0.07 0.02 0.06 0.08
## VANDALISM.MOTOR.VEHICLE -0.04 0.04 0.09 0.00 -0.05 0.03
## WEAPON.POSSESSION.HANDGUN 0.57 0.12 0.02 0.07 0.06 0.05
## SEX.OFFENSE...SEX..ASSAULT 0.03 -0.01 0.04 -0.01 0.01 0.02
## drug 0.13 -0.05 0.06 0.05 0.06 0.42
## FAMILY.OFFENSE...ABUSE.CHILD 0.09 0.03 -0.03 0.02 0.51 0.03
## JUVENILE.RUNAWAY 0.03 0.09 0.53 0.02 -0.03 0.03
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.01 -0.13 -0.09 -0.01 0.02 -0.06
## DISORDERLY.CONDUCT 0.06 0.07 0.00 -0.08 -0.01 -0.12
## SUICIDE...POISON.OVERDOSE 0.03 -0.01 0.00 -0.02 0.02 0.00
## LITTERING.TRASH.DUMPING 0.08 0.00 0.02 0.54 0.02 0.03
## TRESPASSING -0.07 -0.08 -0.08 0.02 0.01 0.05
## HARASSMENT.STALKING 0.02 0.03 -0.05 0.01 0.00 0.01
## DRIVING.UNDER.THE.INFLUENCE -0.02 0.02 -0.01 -0.01 -0.01 0.02
## FIRE.OTHER 0.18 0.51 0.09 0.00 0.03 -0.04
## POL.INFORMATION 0.01 0.00 -0.04 0.01 0.02 0.02
## LOST.PROPERTY 0.02 0.05 -0.04 0.04 -0.02 0.03
## RECOVERED.PROPERTY.MONT..CO. 0.05 0.00 0.13 -0.01 -0.04 -0.03
## community_facilities_count 0.01 -0.06 0.01 0.07 0.05 0.01
## Number_of_Sales_2014 -0.02 0.02 -0.03 -0.04 -0.03 -0.01
## Number_of_Crimes_2014 0.03 0.09 0.09 0.01 0.00 0.10
## IRS_Estimated_Population_2014 0.03 0.04 0.05 0.04 0.05 0.06
## Total_Number_of_Sales_State_Planning -0.04 -0.04 0.01 -0.04 -0.03 -0.04
## List.Price -0.02 -0.01 -0.02 0.00 -0.02 -0.01
## Original.List.Price -0.02 -0.01 -0.02 0.00 -0.01 -0.01
## Close.Price -0.02 -0.01 -0.02 0.00 -0.02 -0.01
## Date.Quarter 0.00 0.00 0.00 0.00 0.00 0.00
## DOMM 0.00 -0.01 0.01 0.00 0.01 0.00
## DOMP 0.00 0.00 -0.01 0.00 0.00 0.00
## Baths.All -0.01 -0.01 0.00 0.00 0.00 0.00
## Bedrooms 0.00 -0.01 -0.01 0.01 0.00 0.00
## Total.Square.Footage 0.00 0.00 0.00 0.00 0.00 0.00
## median_sales_num 0.01 -0.01 -0.02 0.00 0.01 0.02
## mean_sales_num 0.02 -0.01 0.02 0.00 0.04 0.02
## RC25 RC21 RC35 RC18 RC29 RC28
## ROB.FIREARM...STREET 0.05 0.00 -0.08 0.01 -0.03 -0.03
## AGG.ASSLT.FIREARM.CITIZEN -0.05 0.01 -0.04 0.00 0.01 0.25
## BURG.FORCE.RES.NIGHT 0.01 0.00 0.00 0.00 0.01 0.01
## LARCENY.PICK.POCKET 0.43 0.00 -0.01 -0.02 0.00 -0.01
## AUTO.THEFT...PASSENGER.VEHICLE 0.06 0.00 -0.04 0.06 -0.02 -0.05
## ASSAULT...BATTERY...CITIZEN -0.02 0.00 0.02 -0.05 0.18 0.03
## VANDALISM.MOTOR.VEHICLE 0.02 0.00 0.01 0.00 0.01 0.01
## WEAPON.POSSESSION.HANDGUN 0.01 0.00 0.01 0.00 0.00 0.01
## SEX.OFFENSE...SEX..ASSAULT 0.02 0.00 0.00 0.00 0.00 0.00
## drug 0.02 0.00 -0.02 -0.01 0.01 0.01
## FAMILY.OFFENSE...ABUSE.CHILD -0.01 0.00 0.00 0.00 0.00 0.01
## JUVENILE.RUNAWAY 0.03 0.00 0.00 0.01 0.01 0.00
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.04 0.00 -0.06 0.02 0.03 0.03
## DISORDERLY.CONDUCT -0.03 0.00 0.25 0.00 0.03 -0.04
## SUICIDE...POISON.OVERDOSE 0.01 0.00 0.00 0.00 0.00 0.00
## LITTERING.TRASH.DUMPING 0.01 0.00 -0.01 0.00 0.00 0.00
## TRESPASSING -0.06 0.00 -0.04 -0.02 -0.08 -0.04
## HARASSMENT.STALKING 0.02 0.00 0.00 0.00 0.00 0.00
## DRIVING.UNDER.THE.INFLUENCE 0.02 0.00 0.00 0.00 0.00 0.00
## FIRE.OTHER 0.05 0.00 0.01 -0.01 0.01 0.01
## POL.INFORMATION 0.01 0.00 0.00 0.00 0.00 0.00
## LOST.PROPERTY 0.09 0.00 0.00 -0.16 0.00 0.00
## RECOVERED.PROPERTY.MONT..CO. -0.08 0.00 0.00 0.23 0.00 0.00
## community_facilities_count 0.02 0.00 0.00 0.00 0.00 0.00
## Number_of_Sales_2014 0.00 0.00 -0.04 -0.02 -0.05 -0.02
## Number_of_Crimes_2014 0.10 0.00 0.09 0.05 0.16 -0.01
## IRS_Estimated_Population_2014 -0.05 0.00 0.03 0.02 0.11 0.03
## Total_Number_of_Sales_State_Planning 0.01 0.00 0.00 -0.01 -0.07 0.00
## List.Price 0.00 0.01 0.00 0.00 0.00 0.00
## Original.List.Price 0.00 -0.01 0.00 0.00 0.00 0.00
## Close.Price 0.01 0.01 0.00 0.00 0.00 0.00
## Date.Quarter 0.00 0.00 0.00 0.00 0.00 0.00
## DOMM 0.00 -0.31 0.00 0.00 0.00 0.00
## DOMP -0.01 0.32 0.00 0.00 0.00 0.00
## Baths.All 0.00 0.00 0.00 0.00 0.00 0.00
## Bedrooms -0.01 0.00 0.00 0.00 0.00 0.00
## Total.Square.Footage 0.00 0.00 0.00 0.00 0.00 0.00
## median_sales_num 0.00 0.00 0.02 0.00 0.03 0.01
## mean_sales_num 0.02 0.00 -0.02 -0.02 -0.02 0.02
## RC30 RC32 RC31 RC33 RC34 RC36
## ROB.FIREARM...STREET 0.22 0.03 0.00 0.00 0.00 0.00
## AGG.ASSLT.FIREARM.CITIZEN -0.02 -0.02 0.00 0.00 0.00 0.00
## BURG.FORCE.RES.NIGHT 0.00 -0.01 0.00 0.00 0.00 0.00
## LARCENY.PICK.POCKET 0.01 0.01 0.00 0.00 0.00 0.00
## AUTO.THEFT...PASSENGER.VEHICLE 0.05 0.18 0.00 0.00 0.00 0.00
## ASSAULT...BATTERY...CITIZEN -0.02 -0.06 0.02 -0.03 0.00 -0.01
## VANDALISM.MOTOR.VEHICLE 0.00 -0.01 0.00 0.00 0.00 0.00
## WEAPON.POSSESSION.HANDGUN 0.00 0.00 0.00 0.00 0.00 0.00
## SEX.OFFENSE...SEX..ASSAULT 0.00 0.00 0.00 0.00 0.00 0.00
## drug 0.01 0.00 0.00 0.00 0.00 0.00
## FAMILY.OFFENSE...ABUSE.CHILD 0.00 0.00 0.00 0.00 0.00 0.00
## JUVENILE.RUNAWAY 0.01 0.00 0.00 0.00 0.00 0.00
## LIQUOR...UNLAWFUL.POSS.UNDER.21 -0.05 -0.01 0.08 0.01 0.00 0.00
## DISORDERLY.CONDUCT -0.04 -0.02 0.00 0.00 0.00 0.00
## SUICIDE...POISON.OVERDOSE 0.00 0.00 0.00 0.00 0.00 0.00
## LITTERING.TRASH.DUMPING 0.00 0.00 0.00 0.00 0.00 0.00
## TRESPASSING -0.04 -0.02 -0.08 0.00 0.00 0.00
## HARASSMENT.STALKING 0.00 0.00 0.00 0.00 0.00 0.00
## DRIVING.UNDER.THE.INFLUENCE 0.00 0.00 0.00 0.00 0.00 0.00
## FIRE.OTHER 0.00 -0.01 0.00 0.00 0.00 0.00
## POL.INFORMATION 0.00 0.00 0.00 0.00 0.00 0.00
## LOST.PROPERTY -0.01 -0.01 0.00 0.00 0.00 0.00
## RECOVERED.PROPERTY.MONT..CO. 0.01 0.02 0.00 0.00 0.00 0.00
## community_facilities_count 0.00 0.00 0.00 0.00 0.00 0.00
## Number_of_Sales_2014 0.00 0.00 0.01 -0.01 0.00 -0.05
## Number_of_Crimes_2014 -0.06 0.03 -0.02 0.04 0.00 0.01
## IRS_Estimated_Population_2014 -0.01 0.01 0.01 0.08 0.00 0.02
## Total_Number_of_Sales_State_Planning 0.01 -0.01 -0.01 -0.07 0.00 0.04
## List.Price 0.00 0.00 0.00 0.00 -0.01 0.00
## Original.List.Price 0.00 0.00 0.00 0.00 0.07 0.00
## Close.Price 0.00 0.00 0.00 0.00 -0.06 0.00
## Date.Quarter 0.00 0.00 0.00 0.00 0.00 0.00
## DOMM 0.00 0.00 0.00 0.00 0.00 0.00
## DOMP 0.00 0.00 0.00 0.00 0.00 0.00
## Baths.All 0.00 0.00 0.00 0.00 0.00 0.00
## Bedrooms 0.00 0.00 0.00 0.00 0.00 0.00
## Total.Square.Footage 0.00 0.00 0.00 0.00 0.00 0.00
## median_sales_num 0.00 -0.01 0.02 -0.01 0.00 0.00
## mean_sales_num -0.01 0.00 -0.02 0.01 0.00 0.00
## RC37 h2 u2 com
## ROB.FIREARM...STREET 0.00 1 1.4e-05 2.1
## AGG.ASSLT.FIREARM.CITIZEN 0.00 1 8.8e-07 2.8
## BURG.FORCE.RES.NIGHT 0.00 1 7.6e-06 4.1
## LARCENY.PICK.POCKET 0.00 1 2.8e-05 3.2
## AUTO.THEFT...PASSENGER.VEHICLE 0.00 1 1.6e-05 3.4
## ASSAULT...BATTERY...CITIZEN 0.00 1 4.6e-05 3.7
## VANDALISM.MOTOR.VEHICLE 0.00 1 1.5e-06 3.1
## WEAPON.POSSESSION.HANDGUN 0.00 1 1.5e-06 3.8
## SEX.OFFENSE...SEX..ASSAULT 0.00 1 2.1e-08 2.0
## drug 0.00 1 6.5e-07 4.6
## FAMILY.OFFENSE...ABUSE.CHILD 0.00 1 1.8e-07 3.6
## JUVENILE.RUNAWAY 0.00 1 3.5e-07 4.3
## LIQUOR...UNLAWFUL.POSS.UNDER.21 0.00 1 1.8e-05 1.3
## DISORDERLY.CONDUCT 0.00 1 2.4e-06 1.7
## SUICIDE...POISON.OVERDOSE 0.00 1 2.5e-06 1.2
## LITTERING.TRASH.DUMPING 0.00 1 4.7e-08 2.3
## TRESPASSING 0.00 1 1.3e-05 1.3
## HARASSMENT.STALKING 0.00 1 1.1e-07 1.8
## DRIVING.UNDER.THE.INFLUENCE 0.00 1 5.1e-07 1.6
## FIRE.OTHER 0.00 1 8.6e-07 3.6
## POL.INFORMATION 0.00 1 1.7e-06 1.9
## LOST.PROPERTY 0.00 1 5.0e-06 1.8
## RECOVERED.PROPERTY.MONT..CO. 0.00 1 3.1e-06 2.4
## community_facilities_count 0.00 1 6.3e-07 2.2
## Number_of_Sales_2014 0.00 1 2.0e-06 1.1
## Number_of_Crimes_2014 -0.01 1 1.5e-04 3.8
## IRS_Estimated_Population_2014 0.00 1 7.3e-05 1.3
## Total_Number_of_Sales_State_Planning 0.00 1 1.1e-06 1.1
## List.Price 0.00 1 1.2e-03 1.2
## Original.List.Price 0.00 1 1.6e-04 1.2
## Close.Price 0.00 1 4.7e-04 1.2
## Date.Quarter 0.00 1 3.6e-10 1.0
## DOMM 0.00 1 1.5e-07 1.2
## DOMP 0.00 1 1.1e-07 1.3
## Baths.All 0.00 1 7.1e-10 1.5
## Bedrooms 0.00 1 3.1e-08 1.5
## Total.Square.Footage 0.00 1 1.5e-11 1.0
## median_sales_num -0.05 1 8.7e-06 1.5
## mean_sales_num 0.05 1 5.6e-06 1.6
##
## RC1 RC12 RC3 RC2 RC5 RC6 RC8 RC11 RC13 RC10
## SS loadings 8.71 4.33 3.91 3.49 2.11 1.80 1.21 1.18 1.04 1.00
## Proportion Var 0.22 0.11 0.10 0.09 0.05 0.05 0.03 0.03 0.03 0.03
## Cumulative Var 0.22 0.33 0.43 0.52 0.58 0.62 0.66 0.69 0.71 0.74
## Proportion Explained 0.22 0.11 0.10 0.09 0.05 0.05 0.03 0.03 0.03 0.03
## Cumulative Proportion 0.22 0.33 0.43 0.52 0.58 0.62 0.66 0.69 0.71 0.74
## RC9 RC24 RC14 RC17 RC7 RC4 RC20 RC16 RC15 RC26
## SS loadings 1.00 0.97 0.97 0.95 0.95 0.94 0.68 0.67 0.43 0.43
## Proportion Var 0.03 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.01 0.01
## Cumulative Var 0.76 0.79 0.81 0.84 0.86 0.89 0.90 0.92 0.93 0.94
## Proportion Explained 0.03 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.01 0.01
## Cumulative Proportion 0.76 0.79 0.81 0.84 0.86 0.89 0.90 0.92 0.93 0.94
## RC27 RC22 RC23 RC19 RC25 RC21 RC35 RC18 RC29 RC28
## SS loadings 0.42 0.34 0.31 0.24 0.23 0.20 0.09 0.09 0.09 0.07
## Proportion Var 0.01 0.01 0.01 0.01 0.01 0.01 0.00 0.00 0.00 0.00
## Cumulative Var 0.95 0.96 0.97 0.98 0.98 0.99 0.99 0.99 0.99 1.00
## Proportion Explained 0.01 0.01 0.01 0.01 0.01 0.01 0.00 0.00 0.00 0.00
## Cumulative Proportion 0.95 0.96 0.97 0.98 0.98 0.99 0.99 0.99 0.99 1.00
## RC30 RC32 RC31 RC33 RC34 RC36 RC37
## SS loadings 0.06 0.04 0.01 0.01 0.01 0.01 0
## Proportion Var 0.00 0.00 0.00 0.00 0.00 0.00 0
## Cumulative Var 1.00 1.00 1.00 1.00 1.00 1.00 1
## Proportion Explained 0.00 0.00 0.00 0.00 0.00 0.00 0
## Cumulative Proportion 1.00 1.00 1.00 1.00 1.00 1.00 1
##
## Mean item complexity = 2.2
## Test of the hypothesis that 37 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0
## with the empirical chi square 0.02 with prob < NA
##
## Fit based upon off diagonal values = 1
#linear model with numeric data from top PCA factors - Model 7
#final model AIC 232728.8 - not an improvement over baseline
mylogit <- lm(
as.formula(paste(colnames(trainset)[31], "~",
paste(colnames(trainset)[c(1,16,3,2,5,6,8,10,13,24)], collapse = "+"),
sep = ""
)),
data=trainset
)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=232747.2
## Close.Price ~ ROB.FIREARM...STREET + LITTERING.TRASH.DUMPING +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN + AUTO.THEFT...PASSENGER.VEHICLE +
## ASSAULT...BATTERY...CITIZEN + WEAPON.POSSESSION.HANDGUN +
## drug + LIQUOR...UNLAWFUL.POSS.UNDER.21 + community_facilities_count
##
## Df Sum of Sq RSS AIC
## - drug 1 7.1020e+10 1.1388e+15 232746
## <none> 1.1387e+15 232747
## - LITTERING.TRASH.DUMPING 1 1.6920e+12 1.1404e+15 232759
## - WEAPON.POSSESSION.HANDGUN 1 3.2981e+12 1.1420e+15 232772
## - AUTO.THEFT...PASSENGER.VEHICLE 1 4.0868e+12 1.1428e+15 232778
## - community_facilities_count 1 1.2830e+13 1.1515e+15 232847
## - ROB.FIREARM...STREET 1 1.3874e+13 1.1526e+15 232856
## - AGG.ASSLT.FIREARM.CITIZEN 1 1.4324e+13 1.1530e+15 232859
## - LIQUOR...UNLAWFUL.POSS.UNDER.21 1 1.7837e+13 1.1565e+15 232887
## - BURG.FORCE.RES.NIGHT 1 3.3992e+13 1.1727e+15 233013
## - ASSAULT...BATTERY...CITIZEN 1 6.0330e+13 1.1990e+15 233215
##
## Step: AIC=232745.8
## Close.Price ~ ROB.FIREARM...STREET + LITTERING.TRASH.DUMPING +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN + AUTO.THEFT...PASSENGER.VEHICLE +
## ASSAULT...BATTERY...CITIZEN + WEAPON.POSSESSION.HANDGUN +
## LIQUOR...UNLAWFUL.POSS.UNDER.21 + community_facilities_count
##
## Df Sum of Sq RSS AIC
## <none> 1.1388e+15 232746
## + drug 1 7.1020e+10 1.1387e+15 232747
## - LITTERING.TRASH.DUMPING 1 1.7932e+12 1.1405e+15 232758
## - WEAPON.POSSESSION.HANDGUN 1 3.6474e+12 1.1424e+15 232773
## - AUTO.THEFT...PASSENGER.VEHICLE 1 4.3407e+12 1.1431e+15 232778
## - community_facilities_count 1 1.2782e+13 1.1515e+15 232845
## - ROB.FIREARM...STREET 1 1.3849e+13 1.1526e+15 232854
## - AGG.ASSLT.FIREARM.CITIZEN 1 1.4856e+13 1.1536e+15 232862
## - LIQUOR...UNLAWFUL.POSS.UNDER.21 1 1.7975e+13 1.1567e+15 232886
## - BURG.FORCE.RES.NIGHT 1 3.7626e+13 1.1764e+15 233040
## - ASSAULT...BATTERY...CITIZEN 1 7.5101e+13 1.2139e+15 233325
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Close.Price ~ ROB.FIREARM...STREET + LITTERING.TRASH.DUMPING +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN + AUTO.THEFT...PASSENGER.VEHICLE +
## ASSAULT...BATTERY...CITIZEN + WEAPON.POSSESSION.HANDGUN +
## drug + LIQUOR...UNLAWFUL.POSS.UNDER.21 + community_facilities_count
##
## Final Model:
## Close.Price ~ ROB.FIREARM...STREET + LITTERING.TRASH.DUMPING +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN + AUTO.THEFT...PASSENGER.VEHICLE +
## ASSAULT...BATTERY...CITIZEN + WEAPON.POSSESSION.HANDGUN +
## LIQUOR...UNLAWFUL.POSS.UNDER.21 + community_facilities_count
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 9097 1.138679e+15 232747.2
## 2 - drug 1 71019914818 9098 1.138750e+15 232745.8
plot(mylogit)
#
#
#
#
#
#predict house price
devset2$predicted_close_price7<-predict(mylogit, devset2)
# how far off are the predicted prices
devset2$difference7 <- devset2$predicted_close_price7 - devset2$Close.Price
devset2$percent_error7 <- abs(devset2$difference7/devset2$Close.Price)
#see the indices for creating the model by index
names(trainset)
## [1] "ROB.FIREARM...STREET"
## [2] "AGG.ASSLT.FIREARM.CITIZEN"
## [3] "BURG.FORCE.RES.NIGHT"
## [4] "LARCENY.PICK.POCKET"
## [5] "AUTO.THEFT...PASSENGER.VEHICLE"
## [6] "ASSAULT...BATTERY...CITIZEN"
## [7] "VANDALISM.MOTOR.VEHICLE"
## [8] "WEAPON.POSSESSION.HANDGUN"
## [9] "SEX.OFFENSE...SEX..ASSAULT"
## [10] "drug"
## [11] "FAMILY.OFFENSE...ABUSE.CHILD"
## [12] "JUVENILE.RUNAWAY"
## [13] "LIQUOR...UNLAWFUL.POSS.UNDER.21"
## [14] "DISORDERLY.CONDUCT"
## [15] "SUICIDE...POISON.OVERDOSE"
## [16] "LITTERING.TRASH.DUMPING"
## [17] "TRESPASSING"
## [18] "HARASSMENT.STALKING"
## [19] "DRIVING.UNDER.THE.INFLUENCE"
## [20] "FIRE.OTHER"
## [21] "POL.INFORMATION"
## [22] "LOST.PROPERTY"
## [23] "RECOVERED.PROPERTY.MONT..CO."
## [24] "community_facilities_count"
## [25] "Number_of_Sales_2014"
## [26] "Number_of_Crimes_2014"
## [27] "IRS_Estimated_Population_2014"
## [28] "Total_Number_of_Sales_State_Planning"
## [29] "List.Price"
## [30] "Original.List.Price"
## [31] "Close.Price"
## [32] "Date.Quarter"
## [33] "DOMM"
## [34] "DOMP"
## [35] "Baths.All"
## [36] "Bedrooms"
## [37] "Total.Square.Footage"
## [38] "median_sales_num"
## [39] "mean_sales_num"
#what if we use the PCA to give us a model
#model with PCA columns
mylogit <- lm(
as.formula(paste(colnames(trainset)[29], "~",
paste(colnames(trainset)[c(1, 11, 3, 2)], collapse = "+"),
sep = ""
)),
data=trainset
)
step<-stepAIC(mylogit, direction = "both")
## Start: AIC=234184.9
## List.Price ~ ROB.FIREARM...STREET + FAMILY.OFFENSE...ABUSE.CHILD +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN
##
## Df Sum of Sq RSS AIC
## <none> 1.3351e+15 234185
## - BURG.FORCE.RES.NIGHT 1 3.6885e+12 1.3388e+15 234208
## - ROB.FIREARM...STREET 1 5.4312e+12 1.3406e+15 234220
## - AGG.ASSLT.FIREARM.CITIZEN 1 2.8118e+13 1.3633e+15 234373
## - FAMILY.OFFENSE...ABUSE.CHILD 1 4.4437e+13 1.3796e+15 234481
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## List.Price ~ ROB.FIREARM...STREET + FAMILY.OFFENSE...ABUSE.CHILD +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN
##
## Final Model:
## List.Price ~ ROB.FIREARM...STREET + FAMILY.OFFENSE...ABUSE.CHILD +
## BURG.FORCE.RES.NIGHT + AGG.ASSLT.FIREARM.CITIZEN
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 9103 1.335134e+15 234184.9
plot(mylogit)
#
#
#
#
#
#predict house price
devset2$predicted_close_price8<-predict(mylogit, devset2)
# how far off are the predicted prices
devset2$difference8 <- devset2$predicted_close_price8 - devset2$Close.Price
devset2$percent_error8 <- abs(devset2$difference8/devset2$Close.Price)
#Non Graphical Solutions to Scree Test
#need more understanding of this concept in order to interpret this plot
library(nFactors)
ev <- eigen(cor(trainset)) # get eigenvalues
ap <- parallel(subject=nrow(trainset),var=ncol(trainset),
rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)
#run PCA again but with fewer factors to get a biplot that is readable
new_trainset <- (trainset)[c(1, 11, 3, 2, 29)]
fit <- princomp(new_trainset, cor=TRUE)
summary(fit) # print variance accounted for
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.712794 1.0099408 0.7672067 0.58367303 0.34216120
## Proportion of Variance 0.586733 0.2039961 0.1177212 0.06813484 0.02341486
## Cumulative Proportion 0.586733 0.7907291 0.9084503 0.97658514 1.00000000
#loadings(fit) # pc loadings
plot(fit,type="lines") # scree plot
#fit$scores # the principal components
biplot(fit)
#attempt at more readable biplot, did not work
biplot(fit, expand=10, xlim=c(-0.30, 0.0), ylim=c(-0.1, 0.1))
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.